1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// Count entry point (Go assembler, amd64).
//
// The Go-side declaration is not visible in this file; the frame layout
// ($0-40 with b_base+0, b_len+8, c+24, ret+32) suggests
// func Count(b []byte, c byte) int -- confirm against the Go stub.
//
// If internal/cpu does not report POPCNT, control is handed to
// ·countGeneric. Otherwise the arguments are placed in the registers
// countbody<> expects (SI = data, BX = length, AL = byte sought,
// R8 = address of the result slot) and control is transferred to it.
TEXT ·Count(SB),NOSPLIT,$0-40
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JNE	generic
	LEAQ	ret+32(FP), R8	// R8 = &ret; countbody<> stores the count there
	MOVB	c+24(FP), AL	// AL = byte sought
	MOVQ	b_len+8(FP), BX	// BX = number of bytes to scan
	MOVQ	b_base+0(FP), SI	// SI = start of the data
	JMP	countbody<>(SB)
generic:
	// No POPCNT available: use the generic implementation.
	JMP	·countGeneric(SB)
17
// CountString entry point (Go assembler, amd64).
//
// The Go-side declaration is not visible in this file; the frame layout
// ($0-32 with s_base+0, s_len+8, c+16, ret+24) suggests
// func CountString(s string, c byte) int -- confirm against the Go stub.
//
// If internal/cpu does not report POPCNT, control is handed to
// ·countGenericString. Otherwise the arguments are placed in the
// registers countbody<> expects (SI = data, BX = length, AL = byte
// sought, R8 = address of the result slot) and control is transferred
// to it.
TEXT ·CountString(SB),NOSPLIT,$0-32
	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
	JNE	generic
	LEAQ	ret+24(FP), R8	// R8 = &ret; countbody<> stores the count there
	MOVB	c+16(FP), AL	// AL = byte sought
	MOVQ	s_len+8(FP), BX	// BX = number of bytes to scan
	MOVQ	s_base+0(FP), SI	// SI = start of the data
	JMP	countbody<>(SB)
generic:
	// No POPCNT available: use the generic implementation.
	JMP	·countGenericString(SB)
27
// countbody counts the bytes equal to AL among the BX bytes starting at
// SI and stores the count as a 64-bit value at the address held in R8.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, BX, CX, DX, DI, R10, R11, R12, X0, X1, and
// Y1-Y3 on the AVX2 path.
//
// Strategy by length:
//   len == 0        store 0
//   0 < len < 16    one masked 16-byte load (anchored at the end of the
//                   data when the check below flags a possible page crossing)
//   len > 32, AVX2  32-byte blocks plus a masked overlapping tail
//   otherwise       16-byte blocks plus a masked overlapping tail
//
// This function requires the POPCNT instruction.
TEXT countbody<>(SB),NOSPLIT,$0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	CMPQ	BX, $16
	JLT	small

	MOVQ	$0, R12	// Accumulator

	MOVQ	SI, DI	// DI = address of the next block to examine

	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Count number of matching bytes
	POPCNTL	DX, DX
	// Accumulate into R12
	ADDQ	DX, R12
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep going while a whole 16-byte block still fits in the data.
	CMPQ	DI, AX
	JBE	sseloop

	// Get the number of bytes to consider in the last 16 bytes
	ANDQ	$15, BX
	JZ	end

	// Create mask to ignore overlap between previous 16 byte block
	// and the next: keep only the top BX bits of the 16-bit match mask.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Process the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched so we need to mask part of it.
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
end:
	MOVQ	R12, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	endzero

	// Check if we'll load across a page boundary.
	// Note: this overwrites AX; the sought byte already lives in X0.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	// We must ignore high bytes as they aren't part of our slice.
	// Create mask: the low BX bits set.
	MOVB	BX, CX
	MOVQ	$1, R10
	SALQ	CL, R10
	SUBQ	$1, R10

	// Load data
	MOVOU	(SI), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	MOVQ	DX, (R8)
	RET
endzero:
	MOVQ	$0, (R8)
	RET

endofpage:
	// We must ignore low bytes as they aren't part of our slice.
	// Create mask: the top BX bits of the 16-bit match mask.
	MOVQ	$16, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10

	// Load data into the high end of X1: the 16-byte load ends exactly
	// at the last byte of the data.
	MOVOU	-16(SI)(BX*1), X1
	// Compare target byte with each byte in data.
	PCMPEQB	X0, X1
	// Move result bits to integer register.
	PMOVMSKB	X1, DX
	// Apply mask
	ANDQ	R10, DX
	// Directly return DX, we don't need to accumulate
	// since we have <16 bytes.
	POPCNTL	DX, DX
	MOVQ	DX, (R8)
	RET

avx2:
	// Only reached with BX > 32. Fall back to the SSE loop when AVX2
	// is not reported by internal/cpu.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	sse
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1	// Y1 = sought byte in every lane
avx2_loop:
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	ADDQ	$32, DI
	// Addresses are unsigned, so use the unsigned condition, matching
	// the SSE loop above.
	CMPQ	DI, R11
	JBE	avx2_loop

	// The loop has consumed every whole 32-byte block. When the length
	// is a multiple of 32 nothing remains, so skip to the end.
	// (The loop only exits once DI > R11, so comparing DI with R11 for
	// equality here could never detect this case.)
	ANDQ	$31, BX
	JZ	endavx

	// Load address of the last 32 bytes.
	// There is an overlap with the previous block.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPMOVMSKB	Y3, DX
	// Exit AVX mode.
	VZEROUPPER

	// Create mask to ignore overlap between previous 32 byte block
	// and the next: keep only the top BX (= len mod 32) bits.
	MOVQ	$32, CX
	SUBQ	BX, CX
	MOVQ	$0xFFFFFFFF, R10
	SARQ	CL, R10
	SALQ	CL, R10
	// Apply mask
	ANDQ	R10, DX
	POPCNTL	DX, DX
	ADDQ	DX, R12
	MOVQ	R12, (R8)
	RET
endavx:
	// Exit AVX mode.
	VZEROUPPER
	MOVQ	R12, (R8)
	RET
202
View as plain text