Text file
src/math/big/arith_arm64.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !math_big_pure_go
6 // +build !math_big_pure_go
7
8 #include "textflag.h"
9
10 // This file provides fast assembly versions for the elementary
11 // arithmetic operations on vectors implemented in arith.go.
12
13 // TODO: Consider re-implementing using Advanced SIMD
14 // once the assembler supports those instructions.
15
16 // func mulWW(x, y Word) (z1, z0 Word)
17 TEXT ·mulWW(SB),NOSPLIT,$0 // full 128-bit product of x and y: z1 = high word, z0 = low word
18 MOVD y+8(FP), R5 // R5 = y
19 MOVD x+0(FP), R4 // R4 = x
20 UMULH R4, R5, R6 // R6 = upper 64 bits of x*y
21 MUL R4, R5, R7 // R7 = lower 64 bits of x*y
22 MOVD R7, z0+24(FP)
23 MOVD R6, z1+16(FP)
24 RET
25
26 // addVV sets z[i] = x[i] + y[i] + carry for every i < len(z) and returns the final carry (0 or 1) in c.
27 // func addVV(z, x, y []Word) (c Word)
28 TEXT ·addVV(SB),NOSPLIT,$0
29 MOVD z_len+8(FP), R0 // R0 = len(z): words left to process (len(x) and len(y) are never read)
30 MOVD x+24(FP), R8 // R8 walks x
31 MOVD y+48(FP), R9 // R9 walks y
32 MOVD z+0(FP), R10 // R10 walks z
33 ADDS $0, R0 // clear carry flag
34 TBZ $0, R0, two // bit 0 of the count clear: skip the single-word step
35 MOVD.P 8(R8), R11 // load one word of x, post-increment the pointer
36 MOVD.P 8(R9), R15 // load one word of y, post-increment the pointer
37 ADCS R15, R11 // R11 = x word + y word + carry; carry flag updated
38 MOVD.P R11, 8(R10) // store the sum word, post-increment the pointer
39 SUB $1, R0 // SUB rather than SUBS, so the carry flag survives
40 two:
41 TBZ $1, R0, loop // bit 1 of the count clear: skip the two-word step
42 LDP.P 16(R8), (R11, R12)
43 LDP.P 16(R9), (R15, R16)
44 ADCS R15, R11
45 ADCS R16, R12
46 STP.P (R11, R12), 16(R10)
47 SUB $2, R0 // the remaining count is now a multiple of 4
48 loop:
49 CBZ R0, done // careful not to touch the carry flag
50 LDP.P 32(R8), (R11, R12) // x words 0-1 of this group; the pointer advances past all four
51 LDP -16(R8), (R13, R14) // x words 2-3, addressed back from the advanced pointer
52 LDP.P 32(R9), (R15, R16) // y words 0-1
53 LDP -16(R9), (R17, R19) // y words 2-3
54 ADCS R15, R11 // four chained add-with-carry steps, lowest word first
55 ADCS R16, R12
56 ADCS R17, R13
57 ADCS R19, R14
58 STP.P (R11, R12), 32(R10)
59 STP (R13, R14), -16(R10)
60 SUB $4, R0
61 B loop
62 done:
63 CSET HS, R0 // extract carry flag
64 MOVD R0, c+72(FP)
65 RET
66
67 // subVV sets z[i] = x[i] - y[i] - borrow for every i < len(z) and returns the final borrow (0 or 1) in c.
68 // func subVV(z, x, y []Word) (c Word)
69 TEXT ·subVV(SB),NOSPLIT,$0
70 MOVD z_len+8(FP), R0 // R0 = len(z): words left to process (len(x) and len(y) are never read)
71 MOVD x+24(FP), R8 // R8 walks x
72 MOVD y+48(FP), R9 // R9 walks y
73 MOVD z+0(FP), R10 // R10 walks z
74 CMP R0, R0 // set carry flag (carry set means "no borrow" for SBCS)
75 TBZ $0, R0, two // bit 0 of the count clear: skip the single-word step
76 MOVD.P 8(R8), R11 // load one word of x, post-increment the pointer
77 MOVD.P 8(R9), R15 // load one word of y, post-increment the pointer
78 SBCS R15, R11 // R11 = x word - y word - borrow; flags updated
79 MOVD.P R11, 8(R10) // store the difference word, post-increment the pointer
80 SUB $1, R0 // SUB rather than SUBS, so the carry flag survives
81 two:
82 TBZ $1, R0, loop // bit 1 of the count clear: skip the two-word step
83 LDP.P 16(R8), (R11, R12)
84 LDP.P 16(R9), (R15, R16)
85 SBCS R15, R11
86 SBCS R16, R12
87 STP.P (R11, R12), 16(R10)
88 SUB $2, R0 // the remaining count is now a multiple of 4
89 loop:
90 CBZ R0, done // careful not to touch the carry flag
91 LDP.P 32(R8), (R11, R12) // x words 0-1 of this group; the pointer advances past all four
92 LDP -16(R8), (R13, R14) // x words 2-3, addressed back from the advanced pointer
93 LDP.P 32(R9), (R15, R16) // y words 0-1
94 LDP -16(R9), (R17, R19) // y words 2-3
95 SBCS R15, R11 // four chained subtract-with-borrow steps, lowest word first
96 SBCS R16, R12
97 SBCS R17, R13
98 SBCS R19, R14
99 STP.P (R11, R12), 32(R10)
100 STP (R13, R14), -16(R10)
101 SUB $4, R0
102 B loop
103 done:
104 CSET LO, R0 // extract the borrow: 1 when the carry flag is clear
105 MOVD R0, c+72(FP)
106 RET
107 // vwOneOp loads one word from (R1) into R4, applies 'instr op1' to it, and stores it to (R3); both pointers post-increment by 8.
108 #define vwOneOp(instr, op1) \
109 MOVD.P 8(R1), R4; \
110 instr op1, R4; \
111 MOVD.P R4, 8(R3);
112
113 // handle the first 1~4 elements before starting iteration in addVW/subVW: instr1 combines R2 with the first word, instr2 (operand $0) handles each further word; counter is decremented after each of the first three words and control jumps to target when it reaches 0 (a counter of 0 on entry never does, so all four words are processed and control falls through)
114 #define vwPreIter(instr1, instr2, counter, target) \
115 vwOneOp(instr1, R2); \
116 SUB $1, counter; \
117 CBZ counter, target; \
118 vwOneOp(instr2, $0); \
119 SUB $1, counter; \
120 CBZ counter, target; \
121 vwOneOp(instr2, $0); \
122 SUB $1, counter; \
123 CBZ counter, target; \
124 vwOneOp(instr2, $0);
125
126 // do one iteration of add or sub in addVW/subVW: branch to 'exit' if counter is 0, otherwise load four words from (R1), apply 'instr $0' to each (results in R8-R11), store them to (R3) and subtract 4 from counter
127 #define vwOneIter(instr, counter, exit) \
128 CBZ counter, exit; \ // careful not to touch the carry flag
129 LDP.P 32(R1), (R4, R5); \
130 LDP -16(R1), (R6, R7); \
131 instr $0, R4, R8; \
132 instr $0, R5, R9; \
133 instr $0, R6, R10; \
134 instr $0, R7, R11; \
135 STP.P (R8, R9), 32(R3); \
136 STP (R10, R11), -16(R3); \
137 SUB $4, counter;
138
139 // do one iteration of copy in addVW/subVW: branch to 'exit' if counter is 0, otherwise move four words from (R1) to (R3) through R4-R7 and subtract 4 from counter
140 #define vwOneIterCopy(counter, exit) \
141 CBZ counter, exit; \
142 LDP.P 32(R1), (R4, R5); \
143 LDP -16(R1), (R6, R7); \
144 STP.P (R4, R5), 32(R3); \
145 STP (R6, R7), -16(R3); \
146 SUB $4, counter;
147
148 // func addVW(z, x []Word, y Word) (c Word)
149 // addVW sets z = x + y, where y is a single word, and returns the carry out in c.
150 // The 'large' branch (len(z) >= 32) checks the carry flag on every iteration and
151 // switches to a plain copy once no carry is pending. The copy is skipped as well
152 // if 'x' and 'z' start at the same address. The checking and branching overhead is
153 // visible when 'z' is small (~5%), so inputs below the threshold of 32 keep the plain loop.
154 TEXT ·addVW(SB),NOSPLIT,$0
155 MOVD z+0(FP), R3 // R3 walks z
156 MOVD z_len+8(FP), R0 // R0 = len(z), words left to process
157 MOVD x+24(FP), R1 // R1 walks x
158 MOVD y+48(FP), R2 // R2 = y
159 CMP $32, R0
160 BGE large // large-sized 'z' and 'x'
161 CBZ R0, len0 // the length of z is 0: c is set to y (still in R2)
162 MOVD.P 8(R1), R4
163 ADDS R2, R4 // z[0] = x[0] + y, set carry
164 MOVD.P R4, 8(R3)
165 SUB $1, R0 // SUB rather than SUBS, so the carry flag survives
166 CBZ R0, len1 // the length of z is 1
167 TBZ $0, R0, two // bit 0 of the remaining count clear: skip the single-word step
168 MOVD.P 8(R1), R4 // do it once
169 ADCS $0, R4 // propagate the carry into this word
170 MOVD.P R4, 8(R3)
171 SUB $1, R0
172 two: // do it twice
173 TBZ $1, R0, loop // bit 1 of the remaining count clear: skip the two-word step
174 LDP.P 16(R1), (R4, R5)
175 ADCS $0, R4, R8 // c, z[i] = x[i] + c
176 ADCS $0, R5, R9
177 STP.P (R8, R9), 16(R3)
178 SUB $2, R0 // the remaining count is now a multiple of 4
179 loop: // do four times per round
180 vwOneIter(ADCS, R0, len1)
181 B loop
182 len1:
183 CSET HS, R2 // extract carry flag
184 len0:
185 MOVD R2, c+56(FP)
186 done:
187 RET
188 large:
189 AND $0x3, R0, R10 // R10 = len(z) mod 4: how many words the pre-iteration must take (0 means 4)
190 AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4
191 // unrolling for the first 1~4 elements to avoid saving the carry
192 // flag in each step, adjust $R0 if we unrolled 4 elements
193 vwPreIter(ADDS, ADCS, R10, add4)
194 SUB $4, R0 // reached only when the pre-iteration consumed a full group of 4
195 add4:
196 BCC copy // carry clear: nothing left to propagate, the rest of z is just x
197 vwOneIter(ADCS, R0, len1)
198 B add4
199 copy:
200 MOVD ZR, c+56(FP) // no carry pending, so the result carry is 0
201 CMP R1, R3 // both pointers advanced in step, so equal here means x and z share a start address
202 BEQ done
203 copy_4: // no carry flag, copy the rest
204 vwOneIterCopy(R0, done)
205 B copy_4
206
207 // func subVW(z, x []Word, y Word) (c Word)
208 // subVW sets z = x - y, where y is a single word, and returns the borrow out in c.
209 // The 'large' branch (len(z) >= 32) checks the carry flag on every iteration and
210 // switches to a plain copy once no borrow is pending. The copy is skipped as well
211 // if 'x' and 'z' start at the same address. The checking and branching overhead is
212 // visible when 'z' is small (~5%), so inputs below the threshold of 32 keep the plain loop.
213 TEXT ·subVW(SB),NOSPLIT,$0
214 MOVD z+0(FP), R3 // R3 walks z
215 MOVD z_len+8(FP), R0 // R0 = len(z), words left to process
216 MOVD x+24(FP), R1 // R1 walks x
217 MOVD y+48(FP), R2 // R2 = y
218 CMP $32, R0
219 BGE large // large-sized 'z' and 'x'
220 CBZ R0, len0 // the length of z is 0: c is set to y (still in R2)
221 MOVD.P 8(R1), R4
222 SUBS R2, R4 // z[0] = x[0] - y, set carry
223 MOVD.P R4, 8(R3)
224 SUB $1, R0 // SUB rather than SUBS, so the carry flag survives
225 CBZ R0, len1 // the length of z is 1
226 TBZ $0, R0, two // do it once
227 MOVD.P 8(R1), R4
228 SBCS $0, R4 // propagate the borrow into this word
229 MOVD.P R4, 8(R3)
230 SUB $1, R0
231 two: // do it twice
232 TBZ $1, R0, loop // bit 1 of the remaining count clear: skip the two-word step
233 LDP.P 16(R1), (R4, R5)
234 SBCS $0, R4, R8 // c, z[i] = x[i] - c
235 SBCS $0, R5, R9
236 STP.P (R8, R9), 16(R3)
237 SUB $2, R0 // the remaining count is now a multiple of 4
238 loop: // do four times per round
239 vwOneIter(SBCS, R0, len1)
240 B loop
241 len1:
242 CSET LO, R2 // extract the borrow: 1 when the carry flag is clear
243 len0:
244 MOVD R2, c+56(FP)
245 done:
246 RET
247 large:
248 AND $0x3, R0, R10 // R10 = len(z) mod 4: how many words the pre-iteration must take (0 means 4)
249 AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4
250 // unrolling for the first 1~4 elements to avoid saving the carry
251 // flag in each step, adjust $R0 if we unrolled 4 elements
252 vwPreIter(SUBS, SBCS, R10, sub4)
253 SUB $4, R0 // reached only when the pre-iteration consumed a full group of 4
254 sub4:
255 BCS copy // carry set: no borrow left to propagate, the rest of z is just x
256 vwOneIter(SBCS, R0, len1)
257 B sub4
258 copy:
259 MOVD ZR, c+56(FP) // no borrow pending, so the result is 0
260 CMP R1, R3 // both pointers advanced in step, so equal here means x and z share a start address
261 BEQ done
262 copy_4: // no carry flag, copy the rest
263 vwOneIterCopy(R0, done)
264 B copy_4
265
266 // func shlVU(z, x []Word, s uint) (c Word)
267 // shlVU sets z = x << s and returns in c the bits shifted out of x[n-1]; c is 0 when s == 0 or len(z) == 0.
268 // Words are processed from the high end of the vectors down to the low end,
269 // which may give wrong results when the low word of x overlaps with the high
270 // word of z. Direct callers need to rule that situation out.
271 TEXT ·shlVU(SB),NOSPLIT,$0
272 LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z)
273 MOVD x+24(FP), R2
274 MOVD s+48(FP), R3
275 ADD R1<<3, R0 // R0 = &z[n]
276 ADD R1<<3, R2 // R2 = &x[n]
277 CBZ R1, len0
278 CBZ R3, copy // if the number of shift is 0, just copy x to z
279 MOVD $64, R4
280 SUB R3, R4 // R4 = 64 - s
281 // handling the most significant element x[n-1]
282 MOVD.W -8(R2), R6 // pre-decrement: R2 = &x[n-1]; R0 stays one word ahead, so stores land one index above the matching loads
283 LSR R4, R6, R5 // return value
284 LSL R3, R6, R8 // x[i] << s
285 SUB $1, R1
286 one: TBZ $0, R1, two // NOTE(review): nothing in this function branches to 'one'
287 MOVD.W -8(R2), R6
288 LSR R4, R6, R7
289 ORR R8, R7 // combine with the pending high part from the word above
290 LSL R3, R6, R8 // new pending high part for the next lower word
291 SUB $1, R1
292 MOVD.W R7, -8(R0)
293 two:
294 TBZ $1, R1, loop // bit 1 of the remaining count clear: skip the two-word step
295 LDP.W -16(R2), (R6, R7) // R7 is the higher of the two words
296 LSR R4, R7, R10
297 ORR R8, R10
298 LSL R3, R7
299 LSR R4, R6, R9
300 ORR R7, R9
301 LSL R3, R6, R8
302 SUB $2, R1
303 STP.W (R9, R10), -16(R0)
304 loop:
305 CBZ R1, done
306 LDP.W -32(R2), (R10, R11) // step down four words; load the lower pair
307 LDP 16(R2), (R12, R13) // load the upper pair of the same group
308 LSR R4, R13, R23
309 ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
310 LSL R3, R13
311 LSR R4, R12, R22
312 ORR R13, R22
313 LSL R3, R12
314 LSR R4, R11, R21
315 ORR R12, R21
316 LSL R3, R11
317 LSR R4, R10, R20
318 ORR R11, R20
319 LSL R3, R10, R8 // pending high part carried into the next group
320 STP.W (R20, R21), -32(R0)
321 STP (R22, R23), 16(R0)
322 SUB $4, R1
323 B loop
324 done:
325 MOVD.W R8, -8(R0) // the first element x[0]
326 MOVD R5, c+56(FP) // the part moved out from x[n-1]
327 RET
328 copy:
329 CMP R0, R2 // both pointers were advanced by n words, so equal here means z and x share a start address
330 BEQ len0
331 TBZ $0, R1, ctwo
332 MOVD.W -8(R2), R4
333 MOVD.W R4, -8(R0)
334 SUB $1, R1
335 ctwo:
336 TBZ $1, R1, cloop
337 LDP.W -16(R2), (R4, R5)
338 STP.W (R4, R5), -16(R0)
339 SUB $2, R1
340 cloop:
341 CBZ R1, len0
342 LDP.W -32(R2), (R4, R5)
343 LDP 16(R2), (R6, R7)
344 STP.W (R4, R5), -32(R0)
345 STP (R6, R7), 16(R0)
346 SUB $4, R1
347 B cloop
348 len0:
349 MOVD $0, c+56(FP) // nothing was shifted out
350 RET
351
352 // func shrVU(z, x []Word, s uint) (c Word)
353 // shrVU sets z = x >> s and returns in c the bits shifted out of x[0]; c is 0 when s == 0 or len(z) == 0.
354 // Words are processed from the low end of the vectors up to the high end,
355 // which may give wrong results when the high word of x overlaps with the low
356 // word of z. Direct callers need to rule that situation out.
357 TEXT ·shrVU(SB),NOSPLIT,$0
358 MOVD z+0(FP), R0 // R0 walks z
359 MOVD z_len+8(FP), R1 // R1 = len(z), words left to process
360 MOVD x+24(FP), R2 // R2 walks x
361 MOVD s+48(FP), R3 // R3 = s
362 MOVD $0, R8
363 MOVD $64, R4
364 SUB R3, R4 // R4 = 64 - s
365 CBZ R1, len0
366 CBZ R3, copy // if the number of shift is 0, just copy x to z
367
368 MOVD.P 8(R2), R20
369 LSR R3, R20, R8 // R8 = pending low part: x[0] >> s
370 LSL R4, R20 // bits shifted out of x[0], moved to the top of the word
371 MOVD R20, c+56(FP) // deal with the first element
372 SUB $1, R1
373
374 TBZ $0, R1, two // bit 0 of the remaining count clear: skip the single-word step
375 MOVD.P 8(R2), R6
376 LSL R4, R6, R20
377 ORR R8, R20 // combine with the pending low part from the word below
378 LSR R3, R6, R8 // new pending low part for the next higher word
379 MOVD.P R20, 8(R0)
380 SUB $1, R1
381 two:
382 TBZ $1, R1, loop // bit 1 of the remaining count clear: skip the two-word step
383 LDP.P 16(R2), (R6, R7)
384 LSL R4, R6, R20
385 LSR R3, R6
386 ORR R8, R20
387 LSL R4, R7, R21
388 LSR R3, R7, R8
389 ORR R6, R21
390 STP.P (R20, R21), 16(R0)
391 SUB $2, R1
392 loop:
393 CBZ R1, done
394 LDP.P 32(R2), (R10, R11) // lower pair of this group; the pointer advances past all four
395 LDP -16(R2), (R12, R13) // upper pair, addressed back from the advanced pointer
396 LSL R4, R10, R20
397 LSR R3, R10
398 ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
399 LSL R4, R11, R21
400 LSR R3, R11
401 ORR R10, R21
402 LSL R4, R12, R22
403 LSR R3, R12
404 ORR R11, R22
405 LSL R4, R13, R23
406 LSR R3, R13, R8 // pending low part carried into the next group
407 ORR R12, R23
408 STP.P (R20, R21), 32(R0)
409 STP (R22, R23), -16(R0)
410 SUB $4, R1
411 B loop
412 done:
413 MOVD R8, (R0) // deal with the last element
414 RET
415 copy:
416 CMP R0, R2 // z and x start at the same address: nothing to copy
417 BEQ len0
418 TBZ $0, R1, ctwo
419 MOVD.P 8(R2), R3
420 MOVD.P R3, 8(R0)
421 SUB $1, R1
422 ctwo:
423 TBZ $1, R1, cloop
424 LDP.P 16(R2), (R4, R5)
425 STP.P (R4, R5), 16(R0)
426 SUB $2, R1
427 cloop:
428 CBZ R1, len0
429 LDP.P 32(R2), (R4, R5)
430 LDP -16(R2), (R6, R7)
431 STP.P (R4, R5), 32(R0)
432 STP (R6, R7), -16(R0)
433 SUB $4, R1
434 B cloop
435 len0:
436 MOVD $0, c+56(FP) // nothing was shifted out
437 RET
438
439 // mulAddVWW sets z = x*y + r, where y and r are single words, and returns the final carry word in c.
440 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
441 TEXT ·mulAddVWW(SB),NOSPLIT,$0
442 MOVD z+0(FP), R1 // R1 walks z
443 MOVD z_len+8(FP), R0 // R0 = len(z), words left to process
444 MOVD x+24(FP), R2 // R2 walks x
445 MOVD y+48(FP), R3 // R3 = y
446 MOVD r+56(FP), R4 // R4 = running carry word, seeded with r
447 // c, z = x * y + r
448 TBZ $0, R0, two // bit 0 of the count clear: skip the single-word step
449 MOVD.P 8(R2), R5
450 MUL R3, R5, R7 // R7 = low word of x[i]*y
451 UMULH R3, R5, R8 // R8 = high word of x[i]*y
452 ADDS R4, R7 // add the incoming carry word to the low word
453 ADC $0, R8, R4 // c, z[i] = x[i] * y + r
454 MOVD.P R7, 8(R1)
455 SUB $1, R0
456 two:
457 TBZ $1, R0, loop // bit 1 of the count clear: skip the two-word step
458 LDP.P 16(R2), (R5, R6)
459 MUL R3, R5, R10
460 UMULH R3, R5, R11
461 ADDS R4, R10 // low word 0 + incoming carry
462 MUL R3, R6, R12 // MUL/UMULH do not set flags, so they can sit inside the carry chain
463 UMULH R3, R6, R13
464 ADCS R12, R11 // high word 0 + low word 1 + carry
465 ADC $0, R13, R4 // outgoing carry = high word 1 + carry
466
467 STP.P (R10, R11), 16(R1)
468 SUB $2, R0 // the remaining count is now a multiple of 4
469 loop:
470 CBZ R0, done
471 LDP.P 32(R2), (R5, R6) // x words 0-1 of this group; the pointer advances past all four
472 LDP -16(R2), (R7, R8) // x words 2-3, addressed back from the advanced pointer
473
474 MUL R3, R5, R10
475 UMULH R3, R5, R11
476 ADDS R4, R10 // low word 0 + incoming carry
477 MUL R3, R6, R12
478 UMULH R3, R6, R13
479 ADCS R11, R12 // low word 1 + high word 0 + carry
480
481 MUL R3, R7, R14
482 UMULH R3, R7, R15
483 ADCS R13, R14 // low word 2 + high word 1 + carry
484 MUL R3, R8, R16
485 UMULH R3, R8, R17
486 ADCS R15, R16 // low word 3 + high word 2 + carry
487 ADC $0, R17, R4 // outgoing carry = high word 3 + carry
488
489 STP.P (R10, R12), 32(R1)
490 STP (R14, R16), -16(R1)
491 SUB $4, R0
492 B loop
493 done:
494 MOVD R4, c+64(FP)
495 RET
496
497 // addMulVVW sets z = z + x*y, where y is a single word, and returns the final carry word in c.
498 // func addMulVVW(z, x []Word, y Word) (c Word)
499 TEXT ·addMulVVW(SB),NOSPLIT,$0
500 MOVD z+0(FP), R1 // R1 walks z
501 MOVD z_len+8(FP), R0 // R0 = len(z), words left to process
502 MOVD x+24(FP), R2 // R2 walks x
503 MOVD y+48(FP), R3 // R3 = y
504 MOVD $0, R4 // R4 = running carry word, initially 0
505
506 TBZ $0, R0, two // bit 0 of the count clear: skip the single-word step
507
508 MOVD.P 8(R2), R5
509 MOVD (R1), R6 // current z word; R1 is advanced by the store below
510
511 MUL R5, R3, R7 // R7 = low word of x[i]*y
512 UMULH R5, R3, R8 // R8 = high word of x[i]*y
513
514 ADDS R7, R6 // z word + low product word
515 ADC $0, R8, R4 // carry word = high product word + carry
516
517 MOVD.P R6, 8(R1)
518 SUB $1, R0
519
520 two:
521 TBZ $1, R0, loop // bit 1 of the count clear: skip the two-word step
522
523 LDP.P 16(R2), (R5, R10) // two x words
524 LDP (R1), (R6, R11) // the matching two z words
525
526 MUL R10, R3, R13 // R12:R13 = second x word * y
527 UMULH R10, R3, R12
528
529 MUL R5, R3, R7 // R8:R7 = first x word * y
530 UMULH R5, R3, R8
531
532 ADDS R4, R6 // first chain: incoming carry and the low word of the second product
533 ADCS R13, R11
534 ADC $0, R12
535
536 ADDS R7, R6 // second chain: both words of the first product
537 ADCS R8, R11
538 ADC $0, R12, R4 // outgoing carry word
539
540 STP.P (R6, R11), 16(R1)
541 SUB $2, R0 // the remaining count is now a multiple of 4
542
543 // The main loop of this code operates on a block of 4 words every iteration
544 // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
545 // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
546 // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
547 loop:
548 CBZ R0, done
549
550 LDP.P 16(R2), (R5, R6)
551 LDP.P 16(R2), (R7, R8)
552
553 LDP (R1), (R9, R10)
554 ADDS R4, R9 // first chain: incoming carry, then the low words of the products of R6, R7, R8
555 MUL R6, R3, R14
556 ADCS R14, R10
557 MUL R7, R3, R15
558 LDP 16(R1), (R11, R12)
559 ADCS R15, R11
560 MUL R8, R3, R16
561 ADCS R16, R12
562 UMULH R8, R3, R20
563 ADC $0, R20 // R20 = high word of the top product + carry of the first chain
564
565 MUL R5, R3, R13
566 ADDS R13, R9 // second chain: low word of the product of R5, then the high words of the products of R5, R6, R7
567 UMULH R5, R3, R17
568 ADCS R17, R10
569 UMULH R6, R3, R21
570 STP.P (R9, R10), 16(R1)
571 ADCS R21, R11
572 UMULH R7, R3, R19
573 ADCS R19, R12
574 STP.P (R11, R12), 16(R1)
575 ADC $0, R20, R4 // outgoing carry word for the next group
576
577 SUB $4, R0
578 B loop
579
580 done:
581 MOVD R4, c+56(FP)
582 RET
583
584
585
View as plain text