// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

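// Go-side declarations these routines implement (a sketch; see
// internal/bytealg and the runtime):
//
//	func Compare(a, b []byte) int   // returns -1, 0, or +1
//	func cmpstring(a, b string) int // backs the string <, <=, >, >= operators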
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
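	// Dispatch summary: fewer than 8 bytes to compare takes the
	// register-only small path, 8 through 63 bytes the 16-byte SSE
	// loop, and 64 bytes or more the unrolled 64-byte loops (AVX2
	// when available, SSE otherwise).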
	CMPQ	SI, DI
	JEQ	allsame	// a and b alias, so order by length alone
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
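
// loop compares 16 bytes per iteration with SSE while at least 16
// bytes remain; it is entered with 8 <= R8 <= 63 and also finishes
// the <= 64-byte tail left over by big_loop.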
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

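// diff64/diff48/diff32 advance SI and DI so that the 16-byte chunk
// holding the first difference sits at offset 0, then fall into
// diff16. Worked example (a sketch): if only byte 3 of a chunk
// differs, PCMPEQB zeroes that byte of X1, PMOVMSKB yields 0xfff7,
// XORQ $0xffff turns it into 0x0008, and BSFQ below reports index 3.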
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// AX = 1 if a's byte is above b's (unsigned)
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

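// diff8 finds the first differing byte without a loop: BSWAPQ makes
// the first byte in memory the most significant, so BSRQ on the XOR
// picks the highest differing bit of that byte, and a's bit there
// decides the order. Sketch: if the first differing bytes are a=0x02
// and b=0x03, they differ only in bit 0, a's bit is 0, so a < b and
// the result is -1.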
// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX	// - bits left (== 64 - bits left mod 64)
	JEQ	allsame

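	// An 8-byte load is used even though fewer bytes remain. If the
	// low byte of the address is above 0xf8, those 8 bytes might
	// cross into an unmapped page, so instead load the 8 bytes
	// ending at the last byte to compare (which stays in the same
	// page) and shift out the leading garbage.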
	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

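// allsame: every compared byte matched (or a and b alias), so the
// result depends only on the lengths: 2*AX + CX - 1 maps
// (alen>blen, alen==blen, alen<blen) to (+1, 0, -1). Sketch:
// Compare([]byte("Go"), []byte("Gopher")) lands here and returns -1,
// since "Go" is a proper prefix of "Gopher".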
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 2*AX + CX - 1 = +1/0/-1 result
	RET

// Compare 64 bytes per iteration using four 16-byte SSE compares;
// requires >= 64 bytes of data.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop


// Compare 64 bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX	// convert EQ to NE
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2


// Avoid the AVX->SSE transition penalty and search the first 32 bytes of the 64-byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

// Same as diff32_avx2, but for the last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

// For a <64-byte remainder, jump to the normal 16-byte loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop