1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build ppc64 || ppc64le
6
7 #include "go_asm.h"
8 #include "textflag.h"
9
// ·IndexByte is the entry stub for searching a byte slice. Its frame names
// (b_base, b_len, c, ret) match a Go declaration of the form
// func IndexByte(b []byte, c byte) int -- presumably declared that way in
// the package's Go source; confirm there.
//
// The stub only places the operands where indexbytebody<> expects them
// (R3 = pointer, R4 = length, R5 = byte, R14 = &ret when results are
// returned on the stack) and then branches to it; indexbytebody<> performs
// the return.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
#ifndef GOEXPERIMENT_regabiargs
	// Stack-based arguments: load everything from the frame.
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte (zero-extended)
	MOVD	$ret+32(FP), R14	// R14 = &ret
#else
	// Register-based arguments: R3 and R4 are used as they arrive; only
	// the search byte has to be copied from R6 into R5.
	MOVD	R6, R5
#endif
	BR	indexbytebody<>(SB)
20
// ·IndexByteString is the entry stub for searching a string. Its frame
// names (s_base, s_len, c, ret) match a Go declaration of the form
// func IndexByteString(s string, c byte) int -- presumably declared that
// way in the package's Go source; confirm there.
//
// The stub only places the operands where indexbytebody<> expects them
// (R3 = pointer, R4 = length, R5 = byte, R14 = &ret when results are
// returned on the stack) and then branches to it; indexbytebody<> performs
// the return.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
#ifndef GOEXPERIMENT_regabiargs
	// Stack-based arguments: load everything from the frame. With
	// register-based arguments nothing is moved before the branch.
	MOVD	s_base+0(FP), R3	// R3 = string
	MOVD	s_len+8(FP), R4		// R4 = length
	MOVBZ	c+16(FP), R5		// R5 = byte (zero-extended)
	MOVD	$ret+24(FP), R14	// R14 = &ret
#endif
	BR	indexbytebody<>(SB)
// indexbytebody<> is the search routine shared by ·IndexByte and
// ·IndexByteString, which branch here with:
//
//	R3  = addr of string
//	R4  = len of string
//	R5  = byte to find (only the low 8 bits are used; the rest of the
//	      register is overwritten when the byte is replicated)
//	R14 = addr of return value when not regabi
//
// On return R3 holds the index of the first matching byte, or -1 when the
// byte does not occur. Without GOEXPERIMENT_regabiargs the same value is
// also stored through R14.
//
// Strategy:
//   - len ≤ 32: compare one aligned doubleword at a time (small_string).
//   - len > 32: compare doublewords until the address is quadword aligned,
//     quadwords until it is 64-byte aligned, then 64 bytes per iteration
//     in the main loop, and finally up to four quadwords (tail).
//
// Loads always start at an aligned address, so bytes before the string
// (first doubleword) and after it (last doubleword/quadword) may be read;
// they are masked or range-checked before a match is reported.
//
// Registers written: R3-R12, R17, CTR, CR0, CR6, CR7, V0-V14.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register (2 copies).
	ADD	R4,R3,R7	// R7 = base + len, one past the last byte (decremented below).
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5	// 4 copies of the byte.
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Consumed by the BLE below.
	MOVD	$-1,R9		// All ones; shifted into the first-doubleword mask below.
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28): R6 = (R3 & 7) * 8 bits.
	RLDIMI	$32,R5,$0,R5	// 8 copies: every byte of R5 is now the search byte.
	MOVD	R7,R10		// Save the end address (base + len) in R10 for later.
	ADD	$-1,R7,R7	// R7 = address of the last byte of the string.
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = base & 7, bytes of padding before the string.

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// R6 = offset of the last byte within its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// R4 = bytes remaining from the new R8 (len - 8 + padding).

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB	$0,V0		// Replicate 0 across V0
	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11
	VSLB	V11,V10,V10
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
	BLE	tail

	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	$63,R8,R11
	BEQ	CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	// At most three quadwords are needed, so the steps are unrolled.
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	ADD	$-16,R4,R4

	ANDCC	$63,R8,R11
	BEQ	CR0,preloop
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
	BNE	CR6,found_qw_align
	ADD	$-16,R4,R4
	ADD	$16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail		// If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The end address (base + len) is in R10, so our
	// loop counter starts at (R10-R8)/64.
	SUB	R8,R10,R6	// R6 = bytes remaining; its low 6 bits are reused after the loop.
	SRD	$6,R6,R9	// Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8	// Adjust index for loop entry
	MOVD	$16,R11		// Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	$64,R8,R8	// Fuse addi+lvx for performance
	LVX	(R8+R0),V2	// Load 4 16-byte vectors
	LVX	(R8+R11),V3
	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
	VCMPEQUB	V1,V3,V7

	LVX	(R8+R9),V4
	LVX	(R8+R7),V5
	VCMPEQUB	V1,V4,V8
	VCMPEQUB	V1,V5,V9

	VOR	V6,V7,V11	// Compress the result in a single vector
	VOR	V8,V9,V12
	VOR	V11,V12,V13
	VCMPEQUBCC	V0,V13,V14	// Compare the combined result with zero.
	BGE	CR6,found	// Taken unless every byte of V13 is zero, i.e. on a match.
	BC	16,0,loop	// bdnz loop

	// Handle the trailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4	// R4 = R6 & 63, the bytes left after the whole 64-byte blocks.
	ADD	$64,R8,R8
tail:
	// Up to four quadwords remain. A match found here is range-checked
	// against R4 in found_qw_align, because the final quadword may
	// extend past the end of the string.
	CMPU	R4,$0
	BEQ	notfound
	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align
	ADD	$16,R8,R8
	CMPU	R4,$16,CR6
	BLE	CR6,notfound
	ADD	$-16,R4,R4

	LVX	(R8+R0),V4
	VCMPEQUBCC	V1,V4,V6
	BNE	CR6,found_qw_align

notfound:
	MOVD	$-1,R3
#ifndef GOEXPERIMENT_regabiargs
	MOVD	R3,(R14)
#endif
	RET

found:
	// Reached from the main loop only: the match lies in one of the four
	// quadwords at R8, whose compare results are still in V6-V9.
	//
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	V6,V10,V6
	VBPERMQ	V7,V10,V7
	VBPERMQ	V8,V10,V8
	VBPERMQ	V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	$2,V7,V7,V7
	VSLDOI	$4,V8,V8,V8
	VSLDOI	$6,V9,V9,V9
#else
	VSLDOI	$6,V6,V6,V6
	VSLDOI	$4,V7,V7,V7
	VSLDOI	$2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

	// One bit per byte here, so the zero count is already a byte offset.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	// R3 holds the address of the match; convert it to an index.
	SUB	R17,R3
#ifndef GOEXPERIMENT_regabiargs
	MOVD	R3,(R14)
#endif
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	V6,R3
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11
#else
	VSLDOI	$6,V6,V6,V6
	MFVRD	V6,R3
	CNTLZD	R3,R11
#endif
	ADD	R8,R11,R3
	CMPU	R11,R4		// Accept the match only if its offset is below the bytes remaining.
	BLT	return
	BR	notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert the zero-bit count to a byte offset.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return		// Not the last doubleword: the match is inside the string.
	BLE	CR7,return	// Last doubleword: valid only up to the last byte's offset.
	BR	notfound

small_string:
	// We unroll this loop for better performance. A string of at most
	// 32 bytes touches at most five aligned doublewords.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// R6 = offset of the last byte within its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Advance R8 by 8 and load the next doubleword.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	// Fifth and final doubleword: no further length check is needed here
	// because done validates the offset against the last doubleword.
	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound
328
329
View as plain text