1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// IndexByte entry point for a byte-slice argument.
// Presumably declared in Go as func IndexByte(b []byte, c byte) int;
// the declaration lives outside this file.
//
// Moves the arguments into the registers that indexbytebody<> expects
// and jumps to it. The shared body writes the result through R8 and
// executes the RET back to our caller.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	MOVB	c+24(FP), AL		// AL = byte sought
	MOVQ	b_len+8(FP), BX		// BX = number of bytes to search
	MOVQ	b_base+0(FP), SI	// SI = start of the data
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	JMP	indexbytebody<>(SB)
14
// IndexByteString entry point for a string argument.
// Presumably declared in Go as func IndexByteString(s string, c byte) int;
// the declaration lives outside this file.
//
// Moves the arguments into the registers that indexbytebody<> expects
// and jumps to it. The shared body writes the result through R8 and
// executes the RET back to our caller.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	MOVB	c+16(FP), AL		// AL = byte sought
	MOVQ	s_len+8(FP), BX		// BX = number of bytes to search
	MOVQ	s_base+0(FP), SI	// SI = start of the data
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	JMP	indexbytebody<>(SB)
21
// indexbytebody<> searches BX bytes starting at SI for the byte in AL.
// It stores the index of the first match, or -1 if there is none,
// through R8 and returns.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, Y1-Y3 and flags.
//
// Three strategies are used, chosen by length:
//   len < 16:        one 16-byte load, positioned so that it does not
//                    run past the 4096-byte boundary following SI.
//   16 <= len <= 32: 16-byte SSE chunks plus a final, possibly
//                    overlapping, chunk that ends at the end of the data.
//   len > 32:        the same scheme with 32-byte AVX2 chunks when the
//                    X86HasAVX2 flag is set, otherwise the SSE loop.
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI			// DI = address of the chunk being examined

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any. ZF is set when DX was zero.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI			// Compute offset of chunk within data.
	ADDQ	DX, DI			// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure			// Empty input: nothing to find.

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero exactly when SI lies in the last
	// 16 bytes before a 4096-byte boundary; in that case a 16-byte load
	// from SI could touch the next page, so load backwards instead.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1		// Load data
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	CMPL	DX, BX
	JAE	failure			// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that end at SI+BX, so the data occupies the
	// top BX bytes of X1 and the load stays on the current page.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1			// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX		// Move result bits to integer register.
	// Shifting left by len and then right by 16 drops the mask bits
	// that belong to bytes before SI and moves the bit for data[0]
	// down to bit 0.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX			// Shift desired bits down to bottom of register.
	BSFL	DX, DX			// Find first set bit.
	JZ	failure			// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Fall back to the SSE loop unless the X86HasAVX2 flag byte is 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	// AX is still unmodified here, so AL holds the sought byte.
	// VPBROADCASTB only reads the low byte of X0.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1		// Y1 = sought byte in every byte lane
avx2_loop:
	VMOVDQU	(DI), Y2		// Load the next 32-byte chunk.
	VPCMPEQB	Y1, Y2, Y3	// Y3 = 0xff in each lane that matches.
	VPTEST	Y3, Y3			// ZF is clear if any lane matched.
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses: compare them as unsigned values,
	// exactly as the SSE loop does.
	JB	avx2_loop
	// Search the last 32-byte chunk, which may overlap chunks that
	// were already examined.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER			// Clear upper YMM state before returning.
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-lane comparison result.
avx2success:
	VPMOVMSKB	Y3, DX		// One mask bit per byte lane.
	BSFL	DX, DX			// Index of first matching byte in the chunk.
	SUBQ	SI, DI			// Offset of the chunk within the data.
	ADDQ	DI, DX			// Add offset of byte within chunk.
	MOVQ	DX, (R8)
	VZEROUPPER			// Clear upper YMM state before returning.
	RET
148
View as plain text