src/runtime/memmove_arm64.s
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.
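//
// In outline (illustrative pseudocode only, not part of the ABI), the size
// dispatch below is:
//
//	n == 0:      return
//	n <= 16:     copy16 (possibly overlapping 8/4/2/1-byte pairs)
//	n <= 32:     two overlapping 16-byte pairs from the start and the end
//	n <= 128:    copy32_128 (unrolled head/tail loads and stores)
//	otherwise:   copy_long (overlap check, then the pipelined 64-byte loop)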

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
#ifndef GOEXPERIMENT_regabiargs
	MOVD	to+0(FP), R0
	MOVD	from+8(FP), R1
	MOVD	n+16(FP), R2
#endif
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
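	// The first 16 bytes are loaded from the start and the last 16 from the
	// end; since 17 <= n <= 32, the two stores together cover the whole
	// range, overlapping in the middle when n < 32.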
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4	// R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5	// R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

	// Small copies: 1..16 bytes.
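	// For 8 <= n <= 16, an 8-byte load/store from the start and another from
	// the end cover the range, overlapping when n < 16. Smaller sizes fall
	// through to copy7, which decomposes n by its bits.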
copy16:
	ADD	R1, R2, R4	// R4 points just past the last source byte
	ADD	R0, R2, R5	// R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

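	// 1 <= n <= 7: TBZ tests a single bit of n. Bit 2 selects a 4-byte
	// start/end pair (covers 4..7), bit 1 a 2-byte pair (covers 2..3), and
	// what remains is the single-byte case.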
copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
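	// 33 <= n <= 64 is handled with 32 bytes from the start plus 32 bytes
	// from the end, again overlapping in the middle; larger sizes branch to
	// copy128 after the four loads below.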
copy32_128:
	ADD	R1, R2, R4	// R4 points just past the last source byte
	ADD	R0, R2, R5	// R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
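	// 64 bytes are stored from the start (R6-R9, R14-R17) and 32 from the
	// end (R10-R13); for n > 96 an extra 32 bytes from the end are copied
	// first, temporarily reusing R1-R4, which are no longer needed.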
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4	// R4 points just past the last source byte
	ADD	R0, R2, R5	// R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
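	// runtime·arm64UseAlignedLoads is set during CPU feature detection. When
	// it is set, R7/R8 track src/srcend so that loads end up 16-byte aligned;
	// otherwise they track dst/dstend so that stores do. For n < 1024 both
	// stay zero and no realignment is performed.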
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
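	// R14 = dst - src (mod 2^64). If that distance is smaller than n
	// (unsigned, hence BCC), dst lies inside [src, src+n) and a forward copy
	// would overwrite source bytes before reading them, so copy backward.
	// dst == src is a no-op.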
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
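	// R14 is the low 4 bits of whichever pointer was chosen above (0 if
	// neither). Stepping src and dst back by R14 and growing the count rounds
	// the chosen pointer down to a 16-byte boundary, so its accesses at
	// offsets 16, 32, ... are aligned; the first 16 bytes (A) are stored at
	// the original dst, and the up-to-15 bytes re-copied by the following
	// loads are harmless.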
	LDP	(R1), (R12, R13)	// Load A
	AND	$15, R7, R14		// Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3		// move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)	// Load B
	STP	(R12, R13), (R0)	// Store A
	LDP	32(R1), (R8, R9)	// Load C
	LDP	48(R1), (R10, R11)	// Load D
	LDP.W	64(R1), (R12, R13)	// Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

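	// Software pipelined: each iteration stores the 64 bytes loaded on the
	// previous iteration while loading the next 64, hiding load latency.
	// LDP.W/STP.W advance R1/R3 by 64 as a side effect.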
loop64:
	STP	(R6, R7), 16(R3)	// Store B
	LDP	16(R1), (R6, R7)	// Load B (next iteration)
	STP	(R8, R9), 32(R3)	// Store C
	LDP	32(R1), (R8, R9)	// Load C
	STP	(R10, R11), 48(R3)	// Store D
	LDP	48(R1), (R10, R11)	// Load D
	STP.W	(R12, R13), 64(R3)	// Store E
	LDP.W	64(R1), (R12, R13)	// Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
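	// The last bytes are copied as a fixed 64-byte block relative to
	// srcend/dstend; any overlap with bytes already stored is harmless for a
	// forward copy, so no per-size tail handling is needed.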
copy64_from_end:
	LDP	-64(R4), (R14, R15)	// Load F
	STP	(R6, R7), 16(R3)	// Store B
	LDP	-48(R4), (R6, R7)	// Load G
	STP	(R8, R9), 32(R3)	// Store C
	LDP	-32(R4), (R8, R9)	// Load H
	STP	(R10, R11), 48(R3)	// Store D
	LDP	-16(R4), (R10, R11)	// Load I
	STP	(R12, R13), 64(R3)	// Store E
	STP	(R14, R15), -64(R5)	// Store F
	STP	(R6, R7), -48(R5)	// Store G
	STP	(R8, R9), -32(R5)	// Store H
	STP	(R10, R11), -16(R5)	// Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
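	// Mirror image of the forward path: copy the last 16 bytes, round the
	// chosen end pointer down to a 16-byte boundary, run the pipelined loop
	// toward the start, and finish with 64 bytes from the start.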
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET