1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// ·Index is the entry stub for searching with two 24-byte arguments
// (presumably Go slices: base, len, cap -- confirm against the Go
// declaration). Argument a is the haystack, argument b is the needle;
// the 8-byte result slot lives at ret+48(FP).
//
// The stub does no work of its own: it loads the register contract
// expected by indexbody<> and tail-jumps to it. indexbody<> stores the
// result through R11 and executes the RET.
//
// Frame: $0-56 (no locals; 56 bytes of arguments + result).
// The words at 16(FP) and 40(FP) are not read.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI	// DI  = haystack pointer (advanced during the search)
	MOVQ a_len+8(FP), DX	// DX  = haystack length
	MOVQ b_base+24(FP), R8	// R8  = needle pointer
	MOVQ b_len+32(FP), AX	// AX  = needle length
	MOVQ DI, R10		// R10 = haystack start, kept to turn the match address into an offset
	LEAQ ret+48(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)
16
// ·IndexString is the entry stub for searching with two 16-byte arguments
// (presumably Go strings: base, len -- confirm against the Go
// declaration). Argument a is the haystack, argument b is the needle;
// the 8-byte result slot lives at ret+32(FP).
//
// Like ·Index, it only loads the register contract expected by
// indexbody<> and tail-jumps to it. indexbody<> stores the result through
// R11 and executes the RET.
//
// Frame: $0-40 (no locals; 40 bytes of arguments + result).
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI	// DI  = haystack pointer (advanced during the search)
	MOVQ a_len+8(FP), DX	// DX  = haystack length
	MOVQ b_base+16(FP), R8	// R8  = needle pointer
	MOVQ b_len+24(FP), AX	// AX  = needle length
	MOVQ DI, R10		// R10 = haystack start, kept to turn the match address into an offset
	LEAQ ret+32(FP), R11	// R11 = address of the result slot
	JMP indexbody<>(SB)
25
// indexbody<> is the shared search routine reached by tail jump from
// ·Index and ·IndexString. It scans the haystack from its start and stores
// the byte offset of the first position at which the needle matches, or -1
// if there is none, to the address in R11, then returns.
//
// Inputs:
//   AX:  length of the needle (the string we are searching for)
//   DX:  length of the haystack (the string we are searching in)
//   DI:  pointer to the haystack
//   R8:  pointer to the needle
//   R10: pointer to the start of the haystack; subtracted from the match
//        address at "success" to produce the returned offset
//   R11: address where the return value is stored
// Note: the lengths are kept in DX and AX because PCMPESTRI implicitly
// consumes them.
//
// Registers written (which ones depends on the path taken):
//   BX, CX, DX, DI, SI, R8, R9, X0-X3, Y0-Y4, flags.
//
// NOTE(review): needles shorter than 2 bytes fall into the 2-byte path and
// needles longer than 64 bytes would only have their first and last 32
// bytes compared, so callers presumably restrict the needle length --
// confirm against the Go-side callers.
TEXT indexbody<>(SB),NOSPLIT,$0
	// A needle longer than the haystack cannot occur in it.
	CMPQ AX, DX
	JA fail
	// Haystacks of at least 16 bytes are candidates for the PCMPESTRI path.
	CMPQ DX, $16
	JAE sse42
no_sse42:
	// Dispatch on needle length. Every length class follows the same shape:
	// load the needle (or its first and last chunks, which may overlap)
	// into registers once, turn DX into the address one past the last
	// possible match start, then slide DI forward one byte at a time.
	CMPQ AX, $2
	JA _3_or_more
	// Needle length <= 2: one 16-bit compare per position.
	MOVW (R8), R8
	LEAQ -1(DI)(DX*1), DX	// DX = DI + len(haystack) - 2 + 1
loop2:
	MOVW (DI), SI
	CMPW SI, R8
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA _4_or_more
	// Needle length 3: two overlapping 16-bit compares.
	MOVW 1(R8), BX		// BX = needle bytes 1..2
	MOVW (R8), R8		// R8 = needle bytes 0..1
	LEAQ -2(DI)(DX*1), DX	// DX = DI + len(haystack) - 3 + 1
loop3:
	MOVW (DI), SI
	CMPW SI, R8
	JZ partial_success3
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop3
	JMP fail
partial_success3:
	// Bytes 0..1 matched; check bytes 1..2 of the candidate window.
	MOVW 1(DI), SI
	CMPW SI, BX
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA _5_or_more
	// Needle length 4: one 32-bit compare per position.
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX	// DX = DI + len(haystack) - 4 + 1
loop4:
	MOVL (DI), SI
	CMPL SI, R8
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA _8_or_more
	// Needle length 5..7: compare the first 4 bytes, then the last 4 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = DI + len(haystack) - len(needle) + 1
	MOVL -4(R8)(AX*1), BX	// BX = last 4 bytes of the needle
	MOVL (R8), R8		// R8 = first 4 bytes of the needle
loop5to7:
	MOVL (DI), SI
	CMPL SI, R8
	JZ partial_success5to7
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop5to7
	JMP fail
partial_success5to7:
	// Head matched; check the last 4 bytes of the candidate window.
	MOVL -4(AX)(DI*1), SI
	CMPL SI, BX
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA _9_or_more
	// Needle length 8: one 64-bit compare per position.
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX	// DX = DI + len(haystack) - 8 + 1
loop8:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop8
	JMP fail
_9_or_more:
	// Also entered from the sse42 block for needles of 12 bytes or more.
	CMPQ AX, $15
	JA _16_or_more
	// Needle length 9..15: compare the first 8 bytes, then the last 8 bytes.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = DI + len(haystack) - len(needle) + 1
	MOVQ -8(R8)(AX*1), BX	// BX = last 8 bytes of the needle
	MOVQ (R8), R8		// R8 = first 8 bytes of the needle
loop9to15:
	MOVQ (DI), SI
	CMPQ SI, R8
	JZ partial_success9to15
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop9to15
	JMP fail
partial_success9to15:
	// Head matched; check the last 8 bytes of the candidate window.
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI, BX
	JZ success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA _17_or_more
	// Needle length 16: one 128-bit compare per position. PCMPEQB sets a
	// byte to 0xff where the inputs agree, so a byte mask of 0xffff means
	// all 16 bytes are equal.
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX	// DX = DI + len(haystack) - 16 + 1
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA _32_or_more
	// Needle length 17..31: compare the first 16 bytes, then the last 16.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = DI + len(haystack) - len(needle) + 1
	MOVOU -16(R8)(AX*1), X0	// X0 = last 16 bytes of the needle
	MOVOU (R8), X1		// X1 = first 16 bytes of the needle
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE partial_success17to31
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop17to31
	JMP fail
partial_success17to31:
	// Head matched; check the last 16 bytes of the candidate window.
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	CMPQ AX, $32
	JA _33_to_63
	// Needle length 32: one 256-bit compare per position. Exits from the
	// AVX2 paths go through success_avx2 / fail_avx2 so that VZEROUPPER
	// runs before returning.
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX	// DX = DI + len(haystack) - 32 + 1
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// Needle length 33 and up: compare the first 32 bytes, then the last 32.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = DI + len(haystack) - len(needle) + 1
	VMOVDQU -32(R8)(AX*1), Y0	// Y0 = last 32 bytes of the needle
	VMOVDQU (R8), Y1	// Y1 = first 32 bytes of the needle
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE partial_success33to63
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	// Head matched; check the last 32 bytes of the candidate window.
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1, DI
	CMPQ DI, DX
	JB loop33to63
	// Haystack exhausted: fall through into fail_avx2.
fail_avx2:
	VZEROUPPER
fail:
	// No match: report -1.
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
	// Reached only when the haystack is at least 16 bytes long.
	// Without SSE4.2 fall back to the per-length loops above.
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE no_sse42
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	CMPQ AX, $12
	JAE _9_or_more
	// The MOVOU below reads 16 bytes starting at the needle even though the
	// needle is shorter than that. If bits 4..11 of (needle+16) are all
	// zero, take the scalar path instead -- presumably so that the 16-byte
	// load cannot run past the end of the needle's 4 KiB page.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), SI	// SI = DI + len(haystack) - 15: one past the last full 16-byte window start
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the string,
	// otherwise sep is at offset CX from X1 start
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// No full 16-byte window starts at DI any more: run one last compare
	// on the final 16 bytes of the haystack (-1(SI) = haystack end - 16).
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	// CX is the match offset inside the 16-byte window that starts at DI.
	ADDQ CX, DI
success:
	// Convert the match address into an offset from the haystack start.
	SUBQ R10, DI
	MOVQ DI, (R11)
	RET
275
View as plain text