1 // Copyright 2021 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This is an implementation based on the s390x
6 // implementation.
7
8 // Find a separator with 2 <= len <= 32 within a string.
9 // Separators with lengths of 2, 3 or 4 are handled
10 // specially.
11
12 // This works on power8 and above. The loads and
13 // compares are done in big endian order
14 // since that allows the use of VCLZD, and allows
15 // the same implementation to work on big and little
16 // endian platforms with minimal conditional changes.
17
18 // NOTE: There is a power9 implementation that
19 // improves performance by 10-15% on little
20 // endian for some of the benchmarks, but
21 // work is still needed for a big endian
22 // implementation on power9.
23
24 //go:build ppc64 || ppc64le
25
26 #include "go_asm.h"
27 #include "textflag.h"
28
29 // byteswap is the 16 byte permute control used by VLOADSWAP to put
30 // the bytes fetched by LXVD2X into the correct order on POWER8.
// The ppc64 constants list byte indices 0..15 in ascending order; the
// ppc64le constants list the indices of each doubleword in reverse.
31
32 #ifdef GOARCH_ppc64
33 DATA byteswap<>+0(SB)/8, $0x0001020304050607
34 DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
35 #else
36 DATA byteswap<>+0(SB)/8, $0x0706050403020100
37 DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
38 #endif
39
40 // VLOADSWAP loads 16 bytes from base+index in big endian order. Address
41 // alignment does not need checking.
// vsreg is the destination of the LXVD2X and vreg is the register that is
// then permuted in place, so callers pass the two names of one register
// (for example V0 together with V0_). SWAP must already hold byteswap.
42 #define VLOADSWAP(base, index, vreg, vsreg) \
43 LXVD2X (base)(index), vsreg; \
44 VPERM vreg, vreg, SWAP, vreg
45
46 GLOBL byteswap<>+0(SB), RODATA, $16
47
// Index is the entry point for searching a byte slice. It places the
// operands in the registers the search bodies expect (R3=&a[0], R4=len(a),
// R5=&b[0], R6=len(b), and R14=&ret when arguments are passed on the stack)
// and then tail-branches to indexbodyp9 or indexbody.
// The frame layout (a at 0, b at 24, ret at 48) is that of two slice
// arguments and one word result -- presumably func Index(a, b []byte) int.
48 TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
49 #ifdef GOEXPERIMENT_regabiargs
50 // R3 = byte array pointer
51 // R4 = length
// The separator arrives in R6/R7 here (R5 presumably carries cap(a));
// move it down to the R5/R6 pair used by the bodies.
52 MOVD R6,R5 // R5 = separator pointer
53 MOVD R7,R6 // R6 = separator length
54 #else
55 MOVD a_base+0(FP), R3 // R3 = byte array pointer
56 MOVD a_len+8(FP), R4 // R4 = length
57 MOVD b_base+24(FP), R5 // R5 = separator pointer
58 MOVD b_len+32(FP), R6 // R6 = separator length
59 MOVD $ret+48(FP), R14 // R14 = &ret
60 #endif
61
62
// On ppc64le only, pick the POWER9 body when the HasPOWER9 flag byte is 1.
63 #ifdef GOARCH_ppc64le
64 MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag
65 CMP R7, $1
66 BNE power8 // Flag not set: use the POWER8 body
67 BR indexbodyp9<>(SB)
68
69 #endif
70 power8:
71 BR indexbody<>(SB)
72
// IndexString is the entry point for searching a string. It leaves the
// operands in R3=&a[0], R4=len(a), R5=&b[0], R6=len(b) (and R14=&ret when
// arguments are passed on the stack) and then tail-branches to indexbodyp9
// or indexbody.
// The frame layout (a at 0, b at 16, ret at 32) is that of two string
// arguments and one word result -- presumably func IndexString(a, b string) int.
73 TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// No register moves are done under GOEXPERIMENT_regabiargs; the operands
// are presumably already in R3-R6 in that configuration.
74 #ifndef GOEXPERIMENT_regabiargs
75 MOVD a_base+0(FP), R3 // R3 = string
76 MOVD a_len+8(FP), R4 // R4 = length
77 MOVD b_base+16(FP), R5 // R5 = separator pointer
78 MOVD b_len+24(FP), R6 // R6 = separator length
79 MOVD $ret+32(FP), R14 // R14 = &ret
80 #endif
81
82
// On ppc64le only, pick the POWER9 body when the HasPOWER9 flag byte is 1.
83 #ifdef GOARCH_ppc64le
84 MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag
85 CMP R7, $1
86 BNE power8 // Flag not set: use the POWER8 body
87 BR indexbodyp9<>(SB)
88
89 #endif
90 power8:
91 BR indexbody<>(SB)
92
// Register roles shared by indexbody and indexbodyp9.
93 // s: string we are searching
94 // sep: string to search for
95 // R3=&s[0], R4=len(s)
96 // R5=&sep[0], R6=len(sep)
97 // R14=&ret (index where sep found); only when GOEXPERIMENT_regabiargs is not defined
98 // R7=working addr of string
99 // R16=index value 16
100 // R17=index value 17 (set by the bodies but not referenced again)
101 // R18=index value 18 (set by the bodies but not referenced again)
102 // R19=index value 1
103 // R26=LASTBYTE, &s[len(s)] (one past the last byte of string)
104 // R27=LASTSTR last start byte to compare with sep
105 // R8, R9 scratch
106 // V0=sep left justified zero fill
107 // CR4=result of comparing sep length with 16
108
// SEPMASK has 0xff in the leading len(sep) bytes when len(sep) < 16,
// otherwise all ones. ONES is all ones. SWAP holds the byteswap permute
// control used by VLOADSWAP.
109 #define SEPMASK V17
110 #define LASTBYTE R26
111 #define LASTSTR R27
112 #define ONES V20
113 #define SWAP V21
// Vn_ is the VSX register name (VS32+n) for vector register Vn, for use
// with instructions that take a VSX operand.
114 #define V0_ VS32
115 #define V1_ VS33
116 #define V2_ VS34
117 #define V3_ VS35
118 #define V4_ VS36
119 #define V5_ VS37
120 #define V6_ VS38
121 #define V7_ VS39
122 #define V8_ VS40
123 #define V9_ VS41
124 #define SWAP_ VS53
// indexbody is the POWER8 search routine reached from Index and IndexString.
// On entry: R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep); when
// GOEXPERIMENT_regabiargs is not defined R14 holds the address of the
// result slot.
// Result: the byte offset of the match found, or -1, returned in R3
// (regabiargs) or stored through R14.
// A zero length separator, or one longer than s, yields -1. A separator
// longer than 32 bytes is not implemented and faults on purpose.
// Dispatch is on len(sep): lengths 2, 3 and 4 have dedicated vector loops
// (only when sep was loaded from a single aligned 16 byte container),
// other lengths up to 16 use a masked compare that advances one byte at a
// time, and lengths 17..32 compare two 16 byte vectors per position.
125 TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
126 CMP R6, R4 // Compare lengths
127 BGT notfound // If sep len is > string, notfound
128 ADD R4, R3, LASTBYTE // LASTBYTE=&s[len(s)], one past the last byte
129 SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
130 CMP R6, $0 // Check sep len
131 BEQ notfound // sep len 0 -- not found
132 MOVD R3, R7 // Copy of string addr
133 MOVD $16, R16 // Index value 16
134 MOVD $17, R17 // Index value 17
135 MOVD $18, R18 // Index value 18
136 MOVD $1, R19 // Index value 1
137 MOVD $byteswap<>+00(SB), R8 // Address of the permute control
138 VSPLTISB $0xFF, ONES // splat all 1s
139 LXVD2X (R8)(R0), SWAP_ // Set up swap string
140
141 CMP R6, $16, CR4 // CR4 for len(sep) >= 16
142 VOR ONES, ONES, SEPMASK // Set up full SEPMASK
143 BGE CR4, loadge16 // Load for len(sep) >= 16
144 SUB R6, R16, R9 // 16-len of sep
145 SLD $3, R9 // Set up for VSLO
146 MTVSRD R9, V9_ // Set up for VSLO
147 VSLDOI $8, V9, V9, V9 // Set up for VSLO
148 VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
149
150 loadge16:
151 ANDCC $15, R5, R9 // Find byte offset of sep
152 ADD R9, R6, R10 // Add sep len
153 CMP R10, $16 // Check if sep len+offset > 16
154 BGT sepcross16 // Sep crosses 16 byte boundary
155
// sep lies inside one aligned 16 byte container: load the container and
// shift sep to the left edge so no bytes outside the container are read.
156 RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
157 VLOADSWAP(R8, R0, V0, V0_)// Load 16 bytes @R8 into V0
158 SLD $3, R9 // Set up shift count for VSLO
159 MTVSRD R9, V8_ // Set up shift count for VSLO
160 VSLDOI $8, V8, V8, V8
161 VSLO V0, V8, V0 // Shift by start byte
162
163 VAND V0, SEPMASK, V0 // Mask separator (< 16)
164 BR index2plus
165
166 sepcross16:
167 VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0
168
169 VAND V0, SEPMASK, V0 // mask out separator
170 BLE CR4, index2to16 // len(sep) <= 16: generic masked compare
171 BR index17plus // Handle sep > 16
172
173 index2plus:
174 CMP R6, $2 // Check length of sep
175 BNE index3plus // If not 2, check for 3
176 ADD $16, R7, R9 // Check if next 16 bytes past last
177 CMP R9, LASTBYTE // compare with last
178 BGE index2to16 // 2 <= len(string) <= 16
179 MOVD $0xff00, R21 // Mask for later
180 MTVSRD R21, V25 // Move to Vreg
181 VSPLTH $3, V25, V31 // Splat mask
182 VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
183 VSPLTISB $0, V10 // Clear V10
184
185 // First case: 2 byte separator
186 // V1: 2 byte separator splatted
187 // V2: 16 bytes at addr
188 // V3: 16 bytes at addr+1
189 // Compare 2 byte separator at start
190 // and at start+1. Use VSEL to combine
191 // those results to find the first
192 // matching start byte, returning
193 // that value when found. Loop as
194 // long as len(string) > 16
195 index2loop2:
196 VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3
197
198 index2loop:
199 VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
200 VCMPEQUH V1, V2, V5 // Search for sep
201 VCMPEQUH V1, V3, V6 // Search for sep offset by 1
202 VSEL V6, V5, V31, V7 // merge even and odd indices
203 VCLZD V7, V18 // find index of first match
204 MFVSRD V18, R25 // get first value
205 CMP R25, $64 // Found if < 64
206 BLT foundR25 // Return byte index where found
207 VSLDOI $8, V18, V18, V18 // Adjust 2nd value
208 MFVSRD V18, R25 // get second value
209 CMP R25, $64 // Found if < 64
210 ADD $64, R25 // Update byte offset (branch below uses the CMP above)
211 BLT foundR25 // Return value
212 ADD $16, R7 // R7+=16 Update string pointer
213 ADD $17, R7, R9 // R9=R7+17 since loop unrolled
214 CMP R9, LASTBYTE // Compare addr+17 against last byte
215 BLT index2loop2 // If < last, continue loop
// NOTE(review): R7 < LASTBYTE appears to always hold here (the loop was
// entered or continued only with more than 16 bytes ahead of the old R7),
// which would make the three instructions after the branch unreachable --
// confirm.
216 CMP R7, LASTBYTE // Compare advanced addr against last byte
217 BLT index2to16 // If < 16 handle specially
218 VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
219 VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
220 BR index2loop
221
222 index3plus:
223 CMP R6, $3 // Check if sep == 3
224 BNE index4plus // If not check larger
225 ADD $19, R7, R9 // Find bytes for use in this loop
226 CMP R9, LASTBYTE // Compare against last byte
227 BGE index2to16 // Remaining string 2<=len<=16
228 MOVD $0xff00, R21 // Set up mask for upcoming loop
229 MTVSRD R21, V25 // Move mask to Vreg
230 VSPLTH $3, V25, V31 // Splat mask
231 VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
232 VSPLTB $2, V0, V8 // Splat 3rd byte of sep
233
234 // Loop to process 3 byte separator.
235 // string[0:16] is in V2
236 // string[16:18] is left justified in V3
237 // sep[0:2] splatted in V1
238 // sep[2] splatted in V8
239 // Build vectors at string, string+1
240 // and string+2. Compare string, string+1
241 // against first 2 bytes of separator
242 // splatted, and string+2 against 3rd
243 // byte splatted. Merge the results with
244 // VSEL to find the first byte of a match.
245
246 // Special handling for last 16 bytes if the
247 // string fits in 16 byte multiple.
248 index3loop2:
249 MOVD $2, R21 // Set up index for 2
250 VSPLTISB $0, V10 // Clear V10
251 VLOADSWAP(R7, R21, V3, V3_)// Load 16 bytes @R7+2 into V3
252 VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
253
254 index3loop:
255 VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
256 VSLDOI $1, V2, V3, V4 // string[1:17]
257 VSLDOI $2, V2, V3, V9 // string[2:18]
258 VCMPEQUH V1, V2, V5 // compare hw even indices
259 VCMPEQUH V1, V4, V6 // compare hw odd indices
260 VCMPEQUB V8, V9, V10 // compare 3rd to last byte
261 VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
262 VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
263 VCLZD V7, V18 // Find first nonzero indexes
264 MFVSRD V18, R25 // Move 1st doubleword
265 CMP R25, $64 // If < 64 found
266 BLT foundR25 // Return matching index
267 VSLDOI $8, V18, V18, V18 // Move value
268 MFVSRD V18, R25 // Move 2nd doubleword
269 CMP R25, $64 // If < 64 found
270 ADD $64, R25 // Update byte index (branch below uses the CMP above)
271 BLT foundR25 // Return matching index
272 ADD $16, R7 // R7+=16 string ptr
273 ADD $19, R7, R9 // Number of string bytes for loop
274 CMP R9, LASTBYTE // Compare against last byte of string
275 BLT index3loop2 // If within, continue this loop
// NOTE(review): R7 < LASTSTR appears to always hold here for the same
// reason as in the 2 byte loop, leaving the last two instructions of this
// loop unreachable -- confirm.
276 CMP R7, LASTSTR // Compare against last start byte
277 BLT index2to16 // Process remainder
278 VSPLTISB $0, V3 // Special case for last 16 bytes
279 BR index3loop // Continue this loop
280
281 // Loop to process 4 byte separator
282 // string[0:16] in V2
283 // string[16:19] left justified in V3
284 // sep[0:4] splatted in V1
285 // Set up vectors with strings at offsets
286 // 0, 1, 2, 3 and compare against the 4 byte
287 // separator also splatted. Use VSEL with the
288 // compare results to find the first byte where
289 // a separator match is found.
290 index4plus:
291 CMP R6, $4 // Check if 4 byte separator
292 BNE index5plus // If not next higher
293 ADD $20, R7, R9 // Check string size to load
294 CMP R9, LASTBYTE // Verify string length
295 BGE index2to16 // If not large enough, process remaining
296 MOVD $2, R15 // Set up index (R15 is not referenced again in this function)
297
298 // Set up masks for use with VSEL
299 MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
300 SLD $24, R21
301 MTVSRD R21, V10
302 VSPLTW $1, V10, V29
303 VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
304 MOVD $0xffff, R21
305 SLD $16, R21
306 MTVSRD R21, V10
307 VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000...
308 VSPLTW $0, V0, V1 // Splat 1st word of separator
309
310 index4loop:
311 VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
312
313 next4:
314 VSPLTISB $0, V10 // Clear
315 MOVD $3, R9 // Number of bytes beyond 16
316 VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+3 into V3
317 VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
318 VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
319 VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
320 VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
321 VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
322 VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
323 VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
324 VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
325 VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
326 VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
327 VSEL V14, V13, V31, V7 // final merge
328 VCLZD V7, V18 // Find first index for each half
329 MFVSRD V18, R25 // Isolate value
330 CMP R25, $64 // If < 64, found
331 BLT foundR25 // Return found index
332 VSLDOI $8, V18, V18, V18 // Move for MFVSRD
333 MFVSRD V18, R25 // Isolate other value
334 CMP R25, $64 // If < 64, found
335 ADD $64, R25 // Update index for high doubleword (branch below uses the CMP above)
336 BLT foundR25 // Return found index
337 ADD $16, R7 // R7+=16 for next string
338 ADD $20, R7, R9 // R7+20 for all bytes to load
339 CMP R9, LASTBYTE // Past end? Maybe check for extra?
340 BLT index4loop // If not, continue loop
341 CMP R7, LASTSTR // Check remainder
342 BLE index2to16 // Process remainder
343 BR notfound // Not found (NOTE(review): looks unreachable -- confirm)
344
345 index5plus:
346 CMP R6, $16 // Check for sep > 16
347 BGT index17plus // Handle large sep
348
349 // Assumption is that the separator is smaller than the string at this point
350 index2to16:
351 CMP R7, LASTSTR // Compare last start byte
352 BGT notfound // last takes len(sep) into account
353
354 ADD $16, R7, R9 // Check for last byte of string
355 CMP R9, LASTBYTE
356 BGT index2to16tail
357
358 // At least 16 bytes of string left
359 // Mask the number of bytes in sep
360 index2to16loop:
361 VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
362
363 compare:
364 VAND V1, SEPMASK, V2 // Mask out sep size
365 VCMPEQUBCC V0, V2, V3 // Compare masked string
366 BLT CR6, found // All equal
367 ADD $1, R7 // Update ptr to next byte
368 CMP R7, LASTSTR // Still less than last start byte
369 BGT notfound // Not found
370 ADD $16, R7, R9 // Verify remaining bytes
371 CMP R9, LASTBYTE // At least 16
372 BLT index2to16loop // Try again
373
374 // 16 or fewer bytes remaining in string
375 // Separator >= 2
// The remaining bytes are loaded once into V1 and then shifted left one
// byte per candidate position instead of being reloaded.
376 index2to16tail:
377 ADD R3, R4, R9 // End of string
378 SUB R7, R9, R9 // Number of bytes left
379 ANDCC $15, R7, R10 // 16 byte offset
380 ADD R10, R9, R11 // offset + len
381 CMP R11, $16 // > 16?
382 BLE short // Does not cross 16 bytes
383 VLOADSWAP(R7, R0, V1, V1_)// Load 16 bytes @R7 into V1
384 BR index2to16next // Continue on
385
386 short:
387 RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
388 VLOADSWAP(R9, R0, V1, V1_)// Load 16 bytes @R9 into V1
389 SLD $3, R10 // Set up shift
390 MTVSRD R10, V8_ // Set up shift
391 VSLDOI $8, V8, V8, V8
392 VSLO V1, V8, V1 // Shift by start byte
393 VSPLTISB $0, V25 // Clear for later use
394
395 index2to16next:
396 VAND V1, SEPMASK, V2 // Just compare size of sep
397 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
398 BLT CR6, found // Found
399 ADD $1, R7 // Not found, try next partial string
400 CMP R7, LASTSTR // Check for end of string
401 BGT notfound // If at end, then not found
402 VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
403 BR index2to16next // Check the next partial string
404
// 17..32 byte separator: V0 holds sep[0:16] and V1 is loaded from
// sep+len(sep)-16. Each candidate position loads the string at the same two
// offsets and requires both 16 byte compares to match.
// NOTE(review): MTVSRD writes doubleword 0 of V9 while VSLO is documented
// to take its byte count from the last byte of the shift operand, so the
// shift count built here may not be what VSLO applies -- confirm.
405 index17plus:
406 CMP R6, $32 // Check if 16 < len(sep) <= 32
407 BGT index33plus
408 SUB $16, R6, R9 // Extra > 16
409 SLD $56, R9, R10 // Shift to use in VSLO
410 MTVSRD R10, V9_ // Set up for VSLO
411 VLOADSWAP(R5, R9, V1, V1_)// Load 16 bytes @R5+R9 into V1
412 VSLO V1, V9, V1 // Shift left
413 VSPLTISB $0xff, V7 // Splat 1s
414 VSPLTISB $0, V27 // Splat 0 (V27 is not referenced again in this function)
415
416 index17to32loop:
417 VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
418
419 next17:
420 VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
421 VSLO V3, V9, V3 // Shift left
422 VCMPEQUB V0, V2, V4 // Compare first 16 bytes
423 VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
424 VAND V4, V5, V6 // Check if both equal
425 VCMPEQUBCC V6, V7, V8 // All equal?
426 BLT CR6, found // Yes
427 ADD $1, R7 // On to next byte
428 CMP R7, LASTSTR // Check if last start byte
429 BGT notfound // If too high, not found
430 BR index17to32loop // Continue
431
432 notfound:
433 #ifdef GOEXPERIMENT_regabiargs
434 MOVD $-1, R3 // Return -1 if not found
435 #else
436 MOVD $-1, R8 // Return -1 if not found
437 MOVD R8, (R14)
438 #endif
439 RET
440
// Separators longer than 32 bytes are not handled here; the store through
// R0 is there to fault (this relies on R0 holding 0).
441 index33plus:
442 MOVD $0, (R0) // Case not implemented
443 RET // Crash before return
444
// R25 holds the leading zero bit count produced by VCLZD (plus 64 when the
// match is in the second doubleword); R7 is the address of the 16 byte
// chunk that was searched.
445 foundR25:
446 SRD $3, R25 // Convert from bits to bytes
447 ADD R25, R7 // Add to current string address
448 SUB R3, R7 // Subtract from start of string
449 #ifdef GOEXPERIMENT_regabiargs
450 MOVD R7, R3 // Return byte where found
451 #else
452 MOVD R7, (R14) // Return byte where found
453 #endif
454 RET
455
// R7 is the address of the match; convert it to an offset from &s[0].
456 found:
457 SUB R3, R7 // Return byte where found
458 #ifdef GOEXPERIMENT_regabiargs
459 MOVD R7, R3
460 #else
461 MOVD R7, (R14)
462 #endif
463 RET
464
// indexbodyp9 is the search routine that Index and IndexString branch to
// when the HasPOWER9 flag is set on ppc64le. It follows the same structure
// as indexbody but loads with LXVB16X instead of LXVD2X followed by VPERM,
// reads the second doubleword with MFVSRLD, and builds word masks with
// MTVSRWS.
// On entry: R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep); when
// GOEXPERIMENT_regabiargs is not defined R14 holds the address of the
// result slot.
// Result: the byte offset of the match found, or -1, returned in R3
// (regabiargs) or stored through R14.
// A zero length separator, or one longer than s, yields -1. A separator
// longer than 32 bytes is not implemented and faults on purpose.
465 TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
466 CMP R6, R4 // Compare lengths
467 BGT notfound // If sep len is > string, notfound
468 ADD R4, R3, LASTBYTE // LASTBYTE=&s[len(s)], one past the last byte
469 SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
470 CMP R6, $0 // Check sep len
471 BEQ notfound // sep len 0 -- not found
472 MOVD R3, R7 // Copy of string addr
473 MOVD $16, R16 // Index value 16
474 MOVD $17, R17 // Index value 17
475 MOVD $18, R18 // Index value 18
476 MOVD $1, R19 // Index value 1
477 VSPLTISB $0xFF, ONES // splat all 1s
478
479 CMP R6, $16, CR4 // CR4 for len(sep) >= 16
480 VOR ONES, ONES, SEPMASK // Set up full SEPMASK
481 BGE CR4, loadge16 // Load for len(sep) >= 16
482 SUB R6, R16, R9 // 16-len of sep
483 SLD $3, R9 // Set up for VSLO
484 MTVSRD R9, V9_ // Set up for VSLO
485 VSLDOI $8, V9, V9, V9 // Set up for VSLO
486 VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
487
488 loadge16:
489 ANDCC $15, R5, R9 // Find byte offset of sep
490 ADD R9, R6, R10 // Add sep len
491 CMP R10, $16 // Check if sep len+offset > 16
492 BGT sepcross16 // Sep crosses 16 byte boundary
493
// sep lies inside one aligned 16 byte container: load the container and
// shift sep to the left edge so no bytes outside the container are read.
494 RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
495 LXVB16X (R8)(R0), V0_ // Load 16 bytes @R8 into V0
496 SLD $3, R9 // Set up shift count for VSLO
497 MTVSRD R9, V8_ // Set up shift count for VSLO
498 VSLDOI $8, V8, V8, V8
499 VSLO V0, V8, V0 // Shift by start byte
500
501 VAND V0, SEPMASK, V0 // Mask separator (< 16)
502 BR index2plus
503
504 sepcross16:
505 LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
506
507 VAND V0, SEPMASK, V0 // mask out separator
508 BLE CR4, index2to16 // len(sep) <= 16: generic masked compare
509 BR index17plus // Handle sep > 16
510
511 index2plus:
512 CMP R6, $2 // Check length of sep
513 BNE index3plus // If not 2, check for 3
514 ADD $16, R7, R9 // Check if next 16 bytes past last
515 CMP R9, LASTBYTE // compare with last
516 BGE index2to16 // 2 <= len(string) <= 16
517 MOVD $0xff00, R21 // Mask for later
518 MTVSRD R21, V25 // Move to Vreg
519 VSPLTH $3, V25, V31 // Splat mask
520 VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
521 VSPLTISB $0, V10 // Clear V10
522
523 // First case: 2 byte separator
524 // V1: 2 byte separator splatted
525 // V2: 16 bytes at addr
526 // V3: 16 bytes at addr+1
527 // Compare 2 byte separator at start
528 // and at start+1. Use VSEL to combine
529 // those results to find the first
530 // matching start byte, returning
531 // that value when found. Loop as
532 // long as len(string) > 16
533 index2loop2:
534 LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
535
536 index2loop:
537 LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
538 VCMPEQUH V1, V2, V5 // Search for sep
539 VCMPEQUH V1, V3, V6 // Search for sep offset by 1
540 VSEL V6, V5, V31, V7 // merge even and odd indices
541 VCLZD V7, V18 // find index of first match
542 MFVSRD V18, R25 // get first value
543 CMP R25, $64 // Found if < 64
544 BLT foundR25 // Return byte index where found
545
546 MFVSRLD V18, R25 // get second value
547 CMP R25, $64 // Found if < 64
548 ADD $64, R25 // Update byte offset (branch below uses the CMP above)
549 BLT foundR25 // Return value
550 ADD $16, R7 // R7+=16 Update string pointer
551 ADD $17, R7, R9 // R9=R7+17 since loop unrolled
552 CMP R9, LASTBYTE // Compare addr+17 against last byte
553 BLT index2loop2 // If < last, continue loop
// NOTE(review): R7 < LASTBYTE appears to always hold here (the loop was
// entered or continued only with more than 16 bytes ahead of the old R7),
// which would make the three instructions after the branch unreachable --
// confirm.
554 CMP R7, LASTBYTE // Compare advanced addr against last byte
555 BLT index2to16 // If < 16 handle specially
556 LXVB16X (R7)(R0), V3_ // Load 16 bytes @R7 into V3
557 VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
558 BR index2loop
559
560 index3plus:
561 CMP R6, $3 // Check if sep == 3
562 BNE index4plus // If not check larger
563 ADD $19, R7, R9 // Find bytes for use in this loop
564 CMP R9, LASTBYTE // Compare against last byte
565 BGE index2to16 // Remaining string 2<=len<=16
566 MOVD $0xff00, R21 // Set up mask for upcoming loop
567 MTVSRD R21, V25 // Move mask to Vreg
568 VSPLTH $3, V25, V31 // Splat mask
569 VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
570 VSPLTB $2, V0, V8 // Splat 3rd byte of sep
571
572 // Loop to process 3 byte separator.
573 // string[0:16] is in V2
574 // string[16:18] is left justified in V3
575 // sep[0:2] splatted in V1
576 // sep[2] splatted in V8
577 // Build vectors at string, string+1
578 // and string+2. Compare string, string+1
579 // against first 2 bytes of separator
580 // splatted, and string+2 against 3rd
581 // byte splatted. Merge the results with
582 // VSEL to find the first byte of a match.
583
584 // Special handling for last 16 bytes if the
585 // string fits in 16 byte multiple.
586 index3loop2:
587 MOVD $2, R21 // Set up index for 2
588 VSPLTISB $0, V10 // Clear V10
589 LXVB16X (R7)(R21), V3_ // Load 16 bytes @R7+2 into V3
590 VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
591
592 index3loop:
593 LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7
594 VSLDOI $1, V2, V3, V4 // string[1:17]
595 VSLDOI $2, V2, V3, V9 // string[2:18]
596 VCMPEQUH V1, V2, V5 // compare hw even indices
597 VCMPEQUH V1, V4, V6 // compare hw odd indices
598 VCMPEQUB V8, V9, V10 // compare 3rd to last byte
599 VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
600 VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
601 VCLZD V7, V18 // Find first nonzero indexes
602 MFVSRD V18, R25 // Move 1st doubleword
603 CMP R25, $64 // If < 64 found
604 BLT foundR25 // Return matching index
605
606 MFVSRLD V18, R25 // Move 2nd doubleword
607 CMP R25, $64 // If < 64 found
608 ADD $64, R25 // Update byte index (branch below uses the CMP above)
609 BLT foundR25 // Return matching index
610 ADD $16, R7 // R7+=16 string ptr
611 ADD $19, R7, R9 // Number of string bytes for loop
612 CMP R9, LASTBYTE // Compare against last byte of string
613 BLT index3loop2 // If within, continue this loop
// NOTE(review): R7 < LASTSTR appears to always hold here for the same
// reason as in the 2 byte loop, leaving the last two instructions of this
// loop unreachable -- confirm.
614 CMP R7, LASTSTR // Compare against last start byte
615 BLT index2to16 // Process remainder
616 VSPLTISB $0, V3 // Special case for last 16 bytes
617 BR index3loop // Continue this loop
618
619 // Loop to process 4 byte separator
620 // string[0:16] in V2
621 // string[16:19] left justified in V3
622 // sep[0:4] splatted in V1
623 // Set up vectors with strings at offsets
624 // 0, 1, 2, 3 and compare against the 4 byte
625 // separator also splatted. Use VSEL with the
626 // compare results to find the first byte where
627 // a separator match is found.
628 index4plus:
629 CMP R6, $4 // Check if 4 byte separator
630 BNE index5plus // If not next higher
631 ADD $20, R7, R9 // Check string size to load
632 CMP R9, LASTBYTE // Verify string length
633 BGE index2to16 // If not large enough, process remaining
634 MOVD $2, R15 // Set up index (R15 is not referenced again in this function)
635
636 // Set up masks for use with VSEL
637 MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
638 SLD $24, R21
639 MTVSRWS R21, V29 // Splat the word into V29
640
641 VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
642 MOVD $0xffff, R21
643 SLD $16, R21
644 MTVSRWS R21, V31 // Mask 0xffff0000ffff0000...
645
646 VSPLTW $0, V0, V1 // Splat 1st word of separator
647
648 index4loop:
649 LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
650
651 next4:
652 VSPLTISB $0, V10 // Clear
653 MOVD $3, R9 // Number of bytes beyond 16
654 LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7+3 into V3
655 VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
656 VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
657 VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
658 VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
659 VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
660 VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
661 VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
662 VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
663 VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
664 VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
665 VSEL V14, V13, V31, V7 // final merge
666 VCLZD V7, V18 // Find first index for each half
667 MFVSRD V18, R25 // Isolate value
668 CMP R25, $64 // If < 64, found
669 BLT foundR25 // Return found index
670
671 MFVSRLD V18, R25 // Isolate other value
672 CMP R25, $64 // If < 64, found
673 ADD $64, R25 // Update index for high doubleword (branch below uses the CMP above)
674 BLT foundR25 // Return found index
675 ADD $16, R7 // R7+=16 for next string
676 ADD $20, R7, R9 // R7+20 for all bytes to load
677 CMP R9, LASTBYTE // Past end? Maybe check for extra?
678 BLT index4loop // If not, continue loop
679 CMP R7, LASTSTR // Check remainder
680 BLE index2to16 // Process remainder
681 BR notfound // Not found (NOTE(review): looks unreachable -- confirm)
682
683 index5plus:
684 CMP R6, $16 // Check for sep > 16
685 BGT index17plus // Handle large sep
686
687 // Assumption is that the separator is smaller than the string at this point
688 index2to16:
689 CMP R7, LASTSTR // Compare last start byte
690 BGT notfound // last takes len(sep) into account
691
692 ADD $16, R7, R9 // Check for last byte of string
693 CMP R9, LASTBYTE
694 BGT index2to16tail
695
696 // At least 16 bytes of string left
697 // Mask the number of bytes in sep
698 index2to16loop:
699 LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
700
701 compare:
702 VAND V1, SEPMASK, V2 // Mask out sep size
703 VCMPEQUBCC V0, V2, V3 // Compare masked string
704 BLT CR6, found // All equal
705 ADD $1, R7 // Update ptr to next byte
706 CMP R7, LASTSTR // Still less than last start byte
707 BGT notfound // Not found
708 ADD $16, R7, R9 // Verify remaining bytes
709 CMP R9, LASTBYTE // At least 16
710 BLT index2to16loop // Try again
711
712 // 16 or fewer bytes remaining in string
713 // Separator >= 2
// The remaining bytes are loaded once into V1 and then shifted left one
// byte per candidate position instead of being reloaded.
714 index2to16tail:
715 ADD R3, R4, R9 // End of string
716 SUB R7, R9, R9 // Number of bytes left
717 ANDCC $15, R7, R10 // 16 byte offset
718 ADD R10, R9, R11 // offset + len
719 CMP R11, $16 // > 16?
720 BLE short // Does not cross 16 bytes
721 LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
722 BR index2to16next // Continue on
723
724 short:
725 RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
726 LXVB16X (R9)(R0), V1_ // Load 16 bytes @R9 into V1
727 SLD $3, R10 // Set up shift
728 MTVSRD R10, V8_ // Set up shift
729 VSLDOI $8, V8, V8, V8
730 VSLO V1, V8, V1 // Shift by start byte
731 VSPLTISB $0, V25 // Clear for later use
732
733 index2to16next:
734 VAND V1, SEPMASK, V2 // Just compare size of sep
735 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
736 BLT CR6, found // Found
737 ADD $1, R7 // Not found, try next partial string
738 CMP R7, LASTSTR // Check for end of string
739 BGT notfound // If at end, then not found
740 VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
741 BR index2to16next // Check the next partial string
742
// 17..32 byte separator: V0 holds sep[0:16] and V1 is loaded from
// sep+len(sep)-16. Each candidate position loads the string at the same two
// offsets and requires both 16 byte compares to match.
// NOTE(review): MTVSRD writes doubleword 0 of V9 while VSLO is documented
// to take its byte count from the last byte of the shift operand, so the
// shift count built here may not be what VSLO applies -- confirm.
743 index17plus:
744 CMP R6, $32 // Check if 16 < len(sep) <= 32
745 BGT index33plus
746 SUB $16, R6, R9 // Extra > 16
747 SLD $56, R9, R10 // Shift to use in VSLO
748 MTVSRD R10, V9_ // Set up for VSLO
749 LXVB16X (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
750 VSLO V1, V9, V1 // Shift left
751 VSPLTISB $0xff, V7 // Splat 1s
752 VSPLTISB $0, V27 // Splat 0 (V27 is not referenced again in this function)
753
754 index17to32loop:
755 LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
756
757 next17:
758 LXVB16X (R7)(R9), V3_ // Load 16 bytes @R7+R9 into V3
759 VSLO V3, V9, V3 // Shift left
760 VCMPEQUB V0, V2, V4 // Compare first 16 bytes
761 VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
762 VAND V4, V5, V6 // Check if both equal
763 VCMPEQUBCC V6, V7, V8 // All equal?
764 BLT CR6, found // Yes
765 ADD $1, R7 // On to next byte
766 CMP R7, LASTSTR // Check if last start byte
767 BGT notfound // If too high, not found
768 BR index17to32loop // Continue
769
770 notfound:
771 #ifdef GOEXPERIMENT_regabiargs
772 MOVD $-1, R3 // Return -1 if not found
773 #else
774 MOVD $-1, R8 // Return -1 if not found
775 MOVD R8, (R14)
776 #endif
777 RET
778
// Separators longer than 32 bytes are not handled here; the store through
// R0 is there to fault (this relies on R0 holding 0).
779 index33plus:
780 MOVD $0, (R0) // Case not implemented
781 RET // Crash before return
782
// R25 holds the leading zero bit count produced by VCLZD (plus 64 when the
// match is in the second doubleword); R7 is the address of the 16 byte
// chunk that was searched.
783 foundR25:
784 SRD $3, R25 // Convert from bits to bytes
785 ADD R25, R7 // Add to current string address
786 SUB R3, R7 // Subtract from start of string
787 #ifdef GOEXPERIMENT_regabiargs
788 MOVD R7, R3 // Return byte where found
789 #else
790 MOVD R7, (R14) // Return byte where found
791 #endif
792 RET
793
// R7 is the address of the match; convert it to an offset from &s[0].
794 found:
795 SUB R3, R7 // Return byte where found
796 #ifdef GOEXPERIMENT_regabiargs
797 MOVD R7, R3
798 #else
799 MOVD R7, (R14)
800 #endif
801 RET
802
803
View as plain text