Text file
src/math/big/arith_ppc64x.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !math_big_pure_go && (ppc64 || ppc64le)
6 // +build !math_big_pure_go
7 // +build ppc64 ppc64le
8
9 #include "textflag.h"
10
11 // This file provides fast assembly versions for the elementary
12 // arithmetic operations on vectors implemented in arith.go.
13
14 // func mulWW(x, y Word) (z1, z0 Word)
15 TEXT ·mulWW(SB), NOSPLIT, $0 // z1:z0 = x * y as a two-word product (z1 = high word, z0 = low word)
16 MOVD x+0(FP), R4 // R4 = x
17 MOVD y+8(FP), R5 // R5 = y
18 MULHDU R4, R5, R6 // R6 = High-order(x*y), unsigned
19 MULLD R4, R5, R7 // R7 = Low-order(x*y)
20 MOVD R6, z1+16(FP) // z1 = high word
21 MOVD R7, z0+24(FP) // z0 = low word
22 RET
23
24 // func addVV(z, x, y []Word) (c Word)
25 // z[i] = x[i] + y[i] for all i, carrying; c is the carry out of the last word (0 when z is empty)
26 TEXT ·addVV(SB), NOSPLIT, $0
27 MOVD z_len+8(FP), R7 // R7 = z_len (the only length read; x and y are assumed to hold at least z_len words)
28 MOVD x+24(FP), R8 // R8 = x[]
29 MOVD y+48(FP), R9 // R9 = y[]
30 MOVD z+0(FP), R10 // R10 = z[]
31
32 // If z_len = 0, we are done
33 CMP R0, R7
34 MOVD R0, R4 // R4 = c = 0
35 BEQ done // Empty input: return c = 0
36
37 // Process the first iteration out of the loop so we can
38 // use MOVDU and avoid 3 index registers updates.
39 MOVD 0(R8), R11 // R11 = x[0]
40 MOVD 0(R9), R12 // R12 = y[0]
41 ADD $-1, R7 // R7 = z_len - 1 (words still to process)
42 ADDC R12, R11, R15 // R15 = x[0] + y[0], set CA
43 CMP R0, R7 // Only feeds the BEQ below; the CA set above is consumed later by ADDE/ADDZE
44 MOVD R15, 0(R10) // z[0]
45 BEQ final // If z_len was 1, we are done
46
47 SRD $2, R7, R5 // R5 = (z_len - 1)/4 = number of 4-word loop iterations
48 CMP R0, R5
49 MOVD R5, CTR // Set up loop counter
50 BEQ tail // If R5 = 0, we can't use the loop
51
52 // Process 4 elements per iteration. Unrolling this loop
53 // means a performance trade-off: we will lose performance
54 // for small values of z_len (0.90x in the worst case), but
55 // gain significant performance as z_len increases (up to
56 // 1.45x).
57 loop:
58 MOVD 8(R8), R11 // R11 = x[i] (R8 points at the last word already processed)
59 MOVD 16(R8), R12 // R12 = x[i+1]
60 MOVD 24(R8), R14 // R14 = x[i+2]
61 MOVDU 32(R8), R15 // R15 = x[i+3], and R8 += 32
62 MOVD 8(R9), R16 // R16 = y[i]
63 MOVD 16(R9), R17 // R17 = y[i+1]
64 MOVD 24(R9), R18 // R18 = y[i+2]
65 MOVDU 32(R9), R19 // R19 = y[i+3], and R9 += 32
66 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
67 ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
68 ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
69 ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
70 MOVD R20, 8(R10) // z[i]
71 MOVD R21, 16(R10) // z[i+1]
72 MOVD R22, 24(R10) // z[i+2]
73 MOVDU R23, 32(R10) // z[i+3], and R10 += 32
74 ADD $-4, R7 // R7 -= 4 (words still to process)
75 BC 16, 0, loop // bdnz: decrement CTR, loop while it is non-zero
76
77 // We may have more elements to read
78 CMP R0, R7
79 BEQ final // Nothing left after the unrolled loop
80
81 // Process the remaining elements, one at a time
82 tail:
83 MOVDU 8(R8), R11 // R11 = x[i], and R8 += 8
84 MOVDU 8(R9), R16 // R16 = y[i], and R9 += 8
85 ADD $-1, R7 // R7 -= 1 (words still to process)
86 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
87 CMP R0, R7
88 MOVDU R20, 8(R10) // z[i], and R10 += 8
89 BEQ final // If R7 = 0, we are done
90
91 MOVDU 8(R8), R11 // Second leftover word: same sequence as above
92 MOVDU 8(R9), R16
93 ADD $-1, R7
94 ADDE R11, R16, R20
95 CMP R0, R7
96 MOVDU R20, 8(R10)
97 BEQ final
98
99 MOVD 8(R8), R11 // Third and last possible leftover word; no pointer update or count check needed
100 MOVD 8(R9), R16
101 ADDE R11, R16, R20
102 MOVD R20, 8(R10)
103
104 final:
105 ADDZE R4 // Capture CA: R4 = 0 + CA
106
107 done:
108 MOVD R4, c+72(FP) // Return c
109 RET
110
111 // func subVV(z, x, y []Word) (c Word)
112 // z[i] = x[i] - y[i] for all i, borrowing; c is the borrow out of the last word (0 when z is empty)
113 TEXT ·subVV(SB), NOSPLIT, $0
114 MOVD z_len+8(FP), R7 // R7 = z_len (the only length read; x and y are assumed to hold at least z_len words)
115 MOVD x+24(FP), R8 // R8 = x[]
116 MOVD y+48(FP), R9 // R9 = y[]
117 MOVD z+0(FP), R10 // R10 = z[]
118
119 // If z_len = 0, we are done
120 CMP R0, R7
121 MOVD R0, R4 // R4 = c = 0
122 BEQ done // Empty input: return c = 0
123
124 // Process the first iteration out of the loop so we can
125 // use MOVDU and avoid 3 index registers updates.
126 MOVD 0(R8), R11 // R11 = x[0]
127 MOVD 0(R9), R12 // R12 = y[0]
128 ADD $-1, R7 // R7 = z_len - 1 (words still to process)
129 SUBC R12, R11, R15 // R15 = x[0] - y[0], set CA (the epilogue returns CA ^ 1, i.e. CA = 0 means a borrow)
130 CMP R0, R7 // Only feeds the BEQ below; the CA set above is consumed later by SUBE/ADDZE
131 MOVD R15, 0(R10) // z[0]
132 BEQ final // If z_len was 1, we are done
133
134 SRD $2, R7, R5 // R5 = (z_len - 1)/4 = number of 4-word loop iterations
135 CMP R0, R5
136 MOVD R5, CTR // Set up loop counter
137 BEQ tail // If R5 = 0, we can't use the loop
138
139 // Process 4 elements per iteration. Unrolling this loop
140 // means a performance trade-off: we will lose performance
141 // for small values of z_len (0.92x in the worst case), but
142 // gain significant performance as z_len increases (up to
143 // 1.45x).
144 loop:
145 MOVD 8(R8), R11 // R11 = x[i] (R8 points at the last word already processed)
146 MOVD 16(R8), R12 // R12 = x[i+1]
147 MOVD 24(R8), R14 // R14 = x[i+2]
148 MOVDU 32(R8), R15 // R15 = x[i+3], and R8 += 32
149 MOVD 8(R9), R16 // R16 = y[i]
150 MOVD 16(R9), R17 // R17 = y[i+1]
151 MOVD 24(R9), R18 // R18 = y[i+2]
152 MOVDU 32(R9), R19 // R19 = y[i+3], and R9 += 32
153 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow, where borrow = 1 - CA
154 SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] - borrow
155 SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] - borrow
156 SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] - borrow
157 MOVD R20, 8(R10) // z[i]
158 MOVD R21, 16(R10) // z[i+1]
159 MOVD R22, 24(R10) // z[i+2]
160 MOVDU R23, 32(R10) // z[i+3], and R10 += 32
161 ADD $-4, R7 // R7 -= 4 (words still to process)
162 BC 16, 0, loop // bdnz: decrement CTR, loop while it is non-zero
163
164 // We may have more elements to read
165 CMP R0, R7
166 BEQ final // Nothing left after the unrolled loop
167
168 // Process the remaining elements, one at a time
169 tail:
170 MOVDU 8(R8), R11 // R11 = x[i], and R8 += 8
171 MOVDU 8(R9), R16 // R16 = y[i], and R9 += 8
172 ADD $-1, R7 // R7 -= 1 (words still to process)
173 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow, where borrow = 1 - CA
174 CMP R0, R7
175 MOVDU R20, 8(R10) // z[i], and R10 += 8
176 BEQ final // If R7 = 0, we are done
177
178 MOVDU 8(R8), R11 // Second leftover word: same sequence as above
179 MOVDU 8(R9), R16
180 ADD $-1, R7
181 SUBE R16, R11, R20
182 CMP R0, R7
183 MOVDU R20, 8(R10)
184 BEQ final
185
186 MOVD 8(R8), R11 // Third and last possible leftover word; no pointer update or count check needed
187 MOVD 8(R9), R16
188 SUBE R16, R11, R20
189 MOVD R20, 8(R10)
190
191 final:
192 ADDZE R4 // R4 = 0 + CA
193 XOR $1, R4 // c = CA ^ 1: the returned borrow is the inverse of CA
194
195 done:
196 MOVD R4, c+72(FP) // Return c
197 RET
198
199 // func addVW(z, x []Word, y Word) (c Word)
200 TEXT ·addVW(SB), NOSPLIT, $0 // z[0] = x[0] + y, then the carry ripples through the remaining words; c is the final carry
201 MOVD z+0(FP), R10 // R10 = z[]
202 MOVD x+24(FP), R8 // R8 = x[]
203 MOVD y+48(FP), R4 // R4 = y = c
204 MOVD z_len+8(FP), R11 // R11 = z_len
205
206 CMP R0, R11 // If z_len is zero, return
207 BEQ done // Empty input: c = y is returned unchanged
208
209 // We will process the first iteration out of the loop so we capture
210 // the value of c. In the subsequent iterations, we will rely on the
211 // value of CA set here.
212 MOVD 0(R8), R20 // R20 = x[0]
213 ADD $-1, R11 // R11 = z_len - 1 (words still to process)
214 ADDC R20, R4, R6 // R6 = x[0] + c, set CA
215 CMP R0, R11 // If z_len was 1, we are done
216 MOVD R6, 0(R10) // z[0]
217 BEQ final
218
219 // We will read 4 elements per iteration
220 SRD $2, R11, R9 // R9 = (z_len - 1)/4 = number of 4-word loop iterations
221 DCBT (R8) // Data cache block touch on the x data (a prefetch hint)
222 CMP R0, R9
223 MOVD R9, CTR // Set up the loop counter
224 BEQ tail // If R9 = 0, we can't use the loop
225
226 loop:
227 MOVD 8(R8), R20 // R20 = x[i] (R8 points at the last word already processed)
228 MOVD 16(R8), R21 // R21 = x[i+1]
229 MOVD 24(R8), R22 // R22 = x[i+2]
230 MOVDU 32(R8), R23 // R23 = x[i+3], and R8 += 32
231 ADDZE R20, R24 // R24 = x[i] + CA
232 ADDZE R21, R25 // R25 = x[i+1] + CA
233 ADDZE R22, R26 // R26 = x[i+2] + CA
234 ADDZE R23, R27 // R27 = x[i+3] + CA
235 MOVD R24, 8(R10) // z[i]
236 MOVD R25, 16(R10) // z[i+1]
237 MOVD R26, 24(R10) // z[i+2]
238 MOVDU R27, 32(R10) // z[i+3], and R10 += 32
239 ADD $-4, R11 // R11 -= 4 (words still to process)
240 BC 16, 0, loop // bdnz: decrement CTR, loop while it is non-zero
241
242 // We may have some elements to read
243 CMP R0, R11
244 BEQ final // Nothing left after the unrolled loop
245
246 tail:
247 MOVDU 8(R8), R20 // R20 = x[i], and R8 += 8
248 ADDZE R20, R24 // R24 = x[i] + CA
249 ADD $-1, R11 // R11 -= 1 (words still to process)
250 MOVDU R24, 8(R10) // z[i], and R10 += 8
251 CMP R0, R11
252 BEQ final // If R11 = 0, we are done
253
254 MOVDU 8(R8), R20 // Second leftover word: same sequence as above
255 ADDZE R20, R24
256 ADD $-1, R11
257 MOVDU R24, 8(R10)
258 CMP R0, R11
259 BEQ final
260
261 MOVD 8(R8), R20 // Third and last possible leftover word; no pointer update or count check needed
262 ADDZE R20, R24
263 MOVD R24, 8(R10)
264
265 final:
266 ADDZE R0, R4 // c = CA
267 done:
268 MOVD R4, c+56(FP) // Return c
269 RET
270
271 // func subVW(z, x []Word, y Word) (c Word)
272 TEXT ·subVW(SB), NOSPLIT, $0 // z[0] = x[0] - y, then the borrow ripples through the remaining words; c is the final borrow
273 MOVD z+0(FP), R10 // R10 = z[]
274 MOVD x+24(FP), R8 // R8 = x[]
275 MOVD y+48(FP), R4 // R4 = y = c
276 MOVD z_len+8(FP), R11 // R11 = z_len
277
278 CMP R0, R11 // If z_len is zero, return
279 BEQ done // Empty input: c = y is returned unchanged
280
281 // We will process the first iteration out of the loop so we capture
282 // the value of c. In the subsequent iterations, we will rely on the
283 // value of CA set here.
284 MOVD 0(R8), R20 // R20 = x[0]
285 ADD $-1, R11 // R11 = z_len - 1 (words still to process)
286 SUBC R4, R20, R6 // R6 = x[0] - c, set CA (the epilogue returns 1 - CA, i.e. CA = 0 means a borrow)
287 CMP R0, R11 // If z_len was 1, we are done
288 MOVD R6, 0(R10) // z[0]
289 BEQ final
290
291 // We will read 4 elements per iteration
292 SRD $2, R11, R9 // R9 = (z_len - 1)/4 = number of 4-word loop iterations
293 DCBT (R8) // Data cache block touch on the x data (a prefetch hint)
294 CMP R0, R9
295 MOVD R9, CTR // Set up the loop counter
296 BEQ tail // If R9 = 0, we can't use the loop
297
298 // The loop here is almost the same as the one used in s390x, but
299 // we don't need to capture CA every iteration because we've already
300 // done that above.
301 loop:
302 MOVD 8(R8), R20 // R20 = x[i] (R8 points at the last word already processed)
303 MOVD 16(R8), R21 // R21 = x[i+1]
304 MOVD 24(R8), R22 // R22 = x[i+2]
305 MOVDU 32(R8), R23 // R23 = x[i+3], and R8 += 32
306 SUBE R0, R20 // R20 = x[i] - 0 - borrow, where borrow = 1 - CA
307 SUBE R0, R21 // R21 = x[i+1] - borrow
308 SUBE R0, R22 // R22 = x[i+2] - borrow
309 SUBE R0, R23 // R23 = x[i+3] - borrow
310 MOVD R20, 8(R10) // z[i]
311 MOVD R21, 16(R10) // z[i+1]
312 MOVD R22, 24(R10) // z[i+2]
313 MOVDU R23, 32(R10) // z[i+3], and R10 += 32
314 ADD $-4, R11 // R11 -= 4 (words still to process)
315 BC 16, 0, loop // bdnz
316
317 // We may have some elements to read
318 CMP R0, R11
319 BEQ final // Nothing left after the unrolled loop
320
321 tail:
322 MOVDU 8(R8), R20 // R20 = x[i], and R8 += 8
323 SUBE R0, R20 // R20 = x[i] - borrow, where borrow = 1 - CA
324 ADD $-1, R11 // R11 -= 1 (words still to process)
325 MOVDU R20, 8(R10) // z[i], and R10 += 8
326 CMP R0, R11
327 BEQ final // If R11 = 0, we are done
328
329 MOVDU 8(R8), R20 // Second leftover word: same sequence as above
330 SUBE R0, R20
331 ADD $-1, R11
332 MOVDU R20, 8(R10)
333 CMP R0, R11
334 BEQ final
335
336 MOVD 8(R8), R20 // Third and last possible leftover word; no pointer update or count check needed
337 SUBE R0, R20
338 MOVD R20, 8(R10)
339
340 final:
341 // Capture CA
342 SUBE R4, R4 // R4 = R4 - R4 - (1 - CA) = CA - 1, i.e. 0 or -1
343 NEG R4, R4 // c = 1 - CA, the borrow out
344
345 done:
346 MOVD R4, c+56(FP) // Return c
347 RET
348
349 TEXT ·shlVU(SB), NOSPLIT, $0 // No ppc64 assembly body here: control is handed straight to shlVU_g
350 BR ·shlVU_g(SB) // Unconditional branch; shlVU_g is defined outside this file (presumably the portable Go version in arith.go; confirm)
351
352 TEXT ·shrVU(SB), NOSPLIT, $0 // No ppc64 assembly body here: control is handed straight to shrVU_g
353 BR ·shrVU_g(SB) // Unconditional branch; shrVU_g is defined outside this file (presumably the portable Go version in arith.go; confirm)
354
355 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
356 TEXT ·mulAddVWW(SB), NOSPLIT, $0 // For each i: z[i] = Low-order(x[i]*y) + c, then c = High-order(x[i]*y) + carry; c starts as r
357 MOVD z+0(FP), R10 // R10 = z[]
358 MOVD x+24(FP), R8 // R8 = x[]
359 MOVD y+48(FP), R9 // R9 = y
360 MOVD r+56(FP), R4 // R4 = r = c
361 MOVD z_len+8(FP), R11 // R11 = z_len
362
363 CMP R0, R11 // If z_len is zero, return
364 BEQ done // Empty input: c = r is returned unchanged
365
366 MOVD 0(R8), R20 // R20 = x[0]; the first word is handled outside the loop
367 ADD $-1, R11 // R11 = z_len - 1 (words still to process)
368 MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
369 MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
370 ADDC R4, R6 // R6 = z0 + r
371 ADDZE R7 // R7 = z1 + CA
372 CMP R0, R11
373 MOVD R7, R4 // R4 = c
374 MOVD R6, 0(R10) // z[i]
375 BEQ done // If z_len was 1, we are done
376
377 // We will read 4 elements per iteration
378 SRD $2, R11, R14 // R14 = (z_len - 1)/4 = number of 4-word loop iterations
379 DCBT (R8) // Data cache block touch on the x data (a prefetch hint)
380 CMP R0, R14
381 MOVD R14, CTR // Set up the loop counter
382 BEQ tail // If R14 = 0, we can't use the loop
383
384 loop:
385 MOVD 8(R8), R20 // R20 = x[i]
386 MOVD 16(R8), R21 // R21 = x[i+1]
387 MOVD 24(R8), R22 // R22 = x[i+2]
388 MOVDU 32(R8), R23 // R23 = x[i+3]
389 MULLD R9, R20, R24 // R24 = z0[i]
390 MULHDU R9, R20, R20 // R20 = z1[i]
391 ADDC R4, R24 // R24 = z0[i] + c
392 ADDZE R20 // R20 = z1[i] + CA
393 MULLD R9, R21, R25 // R25 = z0[i+1]
394 MULHDU R9, R21, R21 // R21 = z1[i+1]
395 ADDC R20, R25 // R25 = z0[i+1] + z1[i] (carry-in from the previous word)
396 ADDZE R21 // R21 = z1[i+1] + CA
397 MULLD R9, R22, R26 // R26 = z0[i+2]
398 MULHDU R9, R22, R22 // R22 = z1[i+2]
399 MULLD R9, R23, R27 // R27 = z0[i+3]
400 MULHDU R9, R23, R23 // R23 = z1[i+3]
401 ADDC R21, R26 // R26 = z0[i+2] + z1[i+1]
402 ADDZE R22 // R22 = z1[i+2] + CA
403 MOVD R24, 8(R10) // z[i]
404 MOVD R25, 16(R10) // z[i+1]
405 ADDC R22, R27 // R27 = z0[i+3] + z1[i+2]
406 ADDZE R23,R4 // update carry
407 MOVD R26, 24(R10) // z[i+2]
408 MOVDU R27, 32(R10) // z[i+3]
409 ADD $-4, R11 // R11 -= 4 (words still to process)
410 BC 16, 0, loop // bdnz
411
412 // We may have some elements to read
413 CMP R0, R11
414 BEQ done // Nothing left after the unrolled loop
415
416 // Process the remaining elements, one at a time
417 tail:
418 MOVDU 8(R8), R20 // R20 = x[i]
419 MULLD R9, R20, R24 // R24 = z0[i]
420 MULHDU R9, R20, R25 // R25 = z1[i]
421 ADD $-1, R11 // R11 -= 1 (words still to process)
422 ADDC R4, R24 // R24 = z0[i] + c
423 ADDZE R25 // R25 = z1[i] + CA
424 MOVDU R24, 8(R10) // z[i]
425 CMP R0, R11
426 MOVD R25, R4 // R4 = c
427 BEQ done // If R11 = 0, we are done
428
429 MOVDU 8(R8), R20 // Second leftover word: same sequence as above
430 MULLD R9, R20, R24
431 MULHDU R9, R20, R25
432 ADD $-1, R11
433 ADDC R4, R24
434 ADDZE R25
435 MOVDU R24, 8(R10)
436 CMP R0, R11
437 MOVD R25, R4
438 BEQ done
439
440 MOVD 8(R8), R20 // Third and last possible leftover word; no pointer update needed
441 MULLD R9, R20, R24
442 MULHDU R9, R20, R25
443 ADD $-1, R11 // NOTE(review): R11 is not read again before RET, so this decrement has no visible effect
444 ADDC R4, R24
445 ADDZE R25
446 MOVD R24, 8(R10)
447 MOVD R25, R4 // R4 = c
448
449 done:
450 MOVD R4, c+64(FP) // Return c
451 RET
452
453 // func addMulVVW(z, x []Word, y Word) (c Word)
454 TEXT ·addMulVVW(SB), NOSPLIT, $0 // For each i: z[i] += x[i]*y + c, with c carrying the overflow word into the next i; c starts at 0
455 MOVD z+0(FP), R10 // R10 = z[]
456 MOVD x+24(FP), R8 // R8 = x[]
457 MOVD y+48(FP), R9 // R9 = y
458 MOVD z_len+8(FP), R22 // R22 = z_len
459
460 MOVD R0, R3 // R3 will be the index register
461 CMP R0, R22 // If z_len is zero, return
462 MOVD R0, R4 // R4 = c = 0
463 MOVD R22, CTR // Initialize loop counter
464 BEQ done // Empty input: return c = 0
465
466 loop:
467 MOVD (R8)(R3), R20 // Load x[i]
468 MOVD (R10)(R3), R21 // Load z[i]
469 MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
470 MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
471 ADDC R21, R6 // R6 = z0
472 ADDZE R7 // R7 = z1
473 ADDC R4, R6 // R6 = z0 + c + 0
474 ADDZE R7, R4 // R4 = c = z1 + CA
475 MOVD R6, (R10)(R3) // Store z[i]
476 ADD $8, R3 // Advance the byte offset by one 8-byte word
477 BC 16, 0, loop // bdnz
478
479 done:
480 MOVD R4, c+56(FP) // Return c
481 RET
482
483
484
View as plain text