// src/runtime/memclr_amd64.s (plain-text export of the Go runtime tree)
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
//
// Register roles:
//	AX = ptr on entry (ABIInternal arg 0); then zeroed and used as the
//	     source for the 1/2/4/8-byte scalar stores.
//	BX = n, bytes remaining; decremented by the bulk loops.
//	DI = current destination pointer.
//	X15 = always zero under Go's internal ABI register conventions,
//	      so it can be stored directly without being cleared here.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX	// AX = 0 for the small scalar stores below

	// MOVOU seems always faster than REP STOSQ.
tail:
	// Dispatch on size. The small cases use pairs of overlapping stores
	// (head + tail) so no per-byte loop is needed.
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE bulk path (>256 bytes, no AVX2): clear 256 bytes per iteration
	// with unaligned 16-byte stores of the always-zero X15.
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail	// re-dispatch to clear the remaining <256 bytes

loop_preheader_avx2:
	VPXOR	Y0, Y0, Y0	// Y0 = 0, the 32-byte zero source for the AVX2 paths
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge
loop_avx2:
	// AVX2 bulk path: clear 128 bytes per iteration.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Tail (BX < 128): four stores ending exactly at DI+BX; they may
	// overlap bytes the loop above already cleared, which is harmless.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER	// avoid AVX/SSE transition penalties in the caller
	RET
loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	VMOVDQU	Y0, 0(DI)	// clear the possibly-unaligned head first
	MOVQ	DI, SI	// SI = original ptr
	ADDQ	$32, DI
	ANDQ	$~31, DI	// round DI up to the next 32-byte boundary
	SUBQ	DI, SI	// SI = -(bytes skipped), in [-32, -1]
	ADDQ	SI, BX	// shrink BX by the head bytes already cleared
loop_avx2_huge:
	// Non-temporal stores bypass the cache for very large clears
	// (>= 32MiB), avoiding cache pollution.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Tail (BX < 128), with ordinary (cached) overlapping stores.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)	// overlaps the first store when BX == 1
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET