src/math/big/arith_s390x.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !math_big_pure_go
// +build !math_big_pure_go

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.

TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R3
	MOVD   y+8(FP), R4
	MULHDU R3, R4
	MOVD   R10, z1+16(FP)
	MOVD   R11, z0+24(FP)
	RET

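// A rough Go sketch of what mulWW computes (not the arith.go source; conversions between
// Word and uint64 elided): the full 64x64 -> 128-bit product of x and y, with the high half
// returned as z1 and the low half as z0. Per the stores above, MULHDU leaves the high half
// in R10 and the low half in R11.
//
//	func mulWW(x, y Word) (z1, z0 Word) {
//		hi, lo := bits.Mul64(uint64(x), uint64(y))
//		return Word(hi), Word(lo)
//	}
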
// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, r8 = R8, r9 = R9,
// r10 = R2, r11 = R5, r12 = R6, r13 = R7, r14 = R1 (R0 is kept at 0); R11 is also used as scratch.
// func addVV(z, x, y []Word) (c Word)

TEXT ·addVV(SB), NOSPLIT, $0
	MOVD addvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·addVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $addvectorfacility+0x00(SB), R1
	MOVD   $·addVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·addVV_novec(SB), 0(R1)
	BR ·addVV_novec(SB)

vectorimpl:
	MOVD $addvectorfacility+0x00(SB), R1
	MOVD $·addVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·addVV_vec(SB), 0(R1)
	BR ·addVV_vec(SB)

GLOBL addvectorfacility+0x00(SB), NOPTR, $8
DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)

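// addvectorfacility is an 8-byte function-pointer slot that starts out pointing at
// addVV_check. The first call through ·addVV tests ·hasVX and overwrites the slot with
// either ·addVV_vec or ·addVV_novec, so every later call branches straight to the chosen
// implementation. Roughly (a Go-like sketch, not actual source):
//
//	var addvectorfacility = addVV_check
//
//	func addVV(z, x, y []Word) (c Word) { return addvectorfacility(z, x, y) }
//
//	func addVV_check(z, x, y []Word) (c Word) {
//		if hasVX {
//			addvectorfacility = addVV_vec
//		} else {
//			addvectorfacility = addVV_novec
//		}
//		return addvectorfacility(z, x, y)
//	}
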
TEXT ·addVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3
	BLT v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0 // c = 0

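	// In the 16x-unrolled loop below, each iteration processes 16 words (128 bytes) per
	// operand. VLM loads several 128-bit vector registers at a time; VPDI with selector 4
	// swaps the two doublewords of a register so that the word at the lower address becomes
	// the low half of the 128-bit quantity. VACQ then adds two 128-bit quantities plus a
	// carry-in, and VACCCQ produces the corresponding carry-out, so the carry is chained
	// through V25..V31 and back into V0 for the next iteration. After the loop, VLGVG
	// extracts the final carry from V0 and NEG stores it as 0 or -1 in R4, the form the
	// scalar tail loops below expect.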
UU1:
	VLM  0(R5), V1, V4 // 64 bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64 bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VACCCQ V1, V9, V0, V25
	VACQ   V1, V9, V0, V17
	VACCCQ V2, V10, V25, V26
	VACQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VACCCQ V3, V11, V26, V27
	VACQ   V3, V11, V26, V19
	VACCCQ V4, V12, V27, V28
	VACQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VACCCQ V5, V13, V28, V29
	VACQ   V5, V13, V28, V21
	VACCCQ V6, V14, V29, V30
	VACQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VACCCQ V7, V15, V30, V31
	VACQ   V7, V15, V30, V23
	VACCCQ V8, V16, V31, V0 // V0 has carry-over
	VACQ   V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128 bytes into z
	ADD  $128, R7
	ADD  $128, R10 // i += 16
	SUB  $16, R3   // n -= 16
	BGE  UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	NEG   R4, R4     // save cf

A1:
	ADD $12, R3 // n += 16

	// s/BLT/BR/ below to disable the unrolled loop
	BLT v1 // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

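// In the scalar loops above, the carry is kept in R4 between iterations as either 0 or -1
// (all ones): ADDC R4, R4 sets the machine carry flag exactly when R4 is -1, the ADDE chain
// then propagates it through the word additions, and ADDE R4, R4 with R4 previously zeroed
// recaptures the carry as 0 or 1, which NEG turns back into 0 or -1. The final NEG at E1
// converts that to the 0-or-1 Word returned as c. The semantics being implemented are,
// roughly (a Go sketch, conversions between Word and uint64 elided):
//
//	func addVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			z[i], c = bits.Add64(x[i], y[i], c)
//		}
//		return c
//	}
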
TEXT ·addVV_novec(SB), NOSPLIT, $0
novec:
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1n    // if n < 0 goto v1n

U1n: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	ADDC R4, R4 // restore CF
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD 8(R9)(R10*1), R11
	ADDE R11, R6
	MOVD 16(R9)(R10*1), R11
	ADDE R11, R7
	MOVD 24(R9)(R10*1), R11
	ADDE R11, R1
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1n      // if n >= 0 goto U1n

v1n:
	ADD $4, R3 // n += 4
	BLE E1n    // if n <= 0 goto E1n

L1n: // n > 0
	ADDC R4, R4 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	ADDE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	ADDE R4, R4 // save CF
	NEG  R4, R4

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1n     // if n > 0 goto L1n

E1n:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

TEXT ·subVV(SB), NOSPLIT, $0
	MOVD subvectorfacility+0x00(SB), R1
	BR   (R1)

TEXT ·subVV_check(SB), NOSPLIT, $0
	MOVB   ·hasVX(SB), R1
	CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
	MOVD   $subvectorfacility+0x00(SB), R1
	MOVD   $·subVV_novec(SB), R2
	MOVD   R2, 0(R1)

	// MOVD $·subVV_novec(SB), 0(R1)
	BR ·subVV_novec(SB)

vectorimpl:
	MOVD $subvectorfacility+0x00(SB), R1
	MOVD $·subVV_vec(SB), R2
	MOVD R2, 0(R1)

	// MOVD $·subVV_vec(SB), 0(R1)
	BR ·subVV_vec(SB)

GLOBL subvectorfacility+0x00(SB), NOPTR, $8
DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)

// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, r8 = R8, r9 = R9,
// r10 = R2, r11 = R5, r12 = R6, r13 = R7, r14 = R1 (R0 is kept at 0); R11 is also used as scratch.
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_vec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2
	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3  // n -= 4
	BLT v1      // if n < 0 goto v1
	SUB $12, R3 // n -= 16
	BLT A1      // if n < 0 goto A1

	MOVD R8, R5
	MOVD R9, R6
	MOVD R2, R7

	// n >= 0
	// regular loop body unrolled 16x
	VZERO V0         // cf = 0
	MOVD  $1, R4     // on s390x a subtraction's carry flag starts at 1, meaning no borrow
	VLVGG $1, R4, V0 // put carry into V0

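	// s390x represents "no borrow" as carry = 1 and "borrow" as carry = 0, for both the
	// scalar SUBC/SUBE instructions and the 128-bit vector subtracts below. The loop
	// therefore seeds the low doubleword of V0 with 1, chains the borrow through
	// VSBCBIQ/VSBIQ exactly as the add path chains its carry, and after the loop converts
	// the extracted value back to the 0-or-minus-1 form the scalar tail expects by
	// subtracting 1 from it.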
UU1:
	VLM  0(R5), V1, V4 // 64 bytes into V1..V4
	ADD  $64, R5
	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

	VLM  0(R6), V9, V12 // 64 bytes into V9..V12
	ADD  $64, R6
	VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order
	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order

	VSBCBIQ V1, V9, V0, V25
	VSBIQ   V1, V9, V0, V17
	VSBCBIQ V2, V10, V25, V26
	VSBIQ   V2, V10, V25, V18

	VLM 0(R5), V5, V6   // 32 bytes into V5..V6
	VLM 0(R6), V13, V14 // 32 bytes into V13..V14
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
	VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order

	VSBCBIQ V3, V11, V26, V27
	VSBIQ   V3, V11, V26, V19
	VSBCBIQ V4, V12, V27, V28
	VSBIQ   V4, V12, V27, V20

	VLM 0(R5), V7, V8   // 32 bytes into V7..V8
	VLM 0(R6), V15, V16 // 32 bytes into V15..V16
	ADD $32, R5
	ADD $32, R6

	VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
	VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order

	VSBCBIQ V5, V13, V28, V29
	VSBIQ   V5, V13, V28, V21
	VSBCBIQ V6, V14, V29, V30
	VSBIQ   V6, V14, V29, V22

	VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
	VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order

	VSBCBIQ V7, V15, V30, V31
	VSBIQ   V7, V15, V30, V23
	VSBCBIQ V8, V16, V31, V0 // V0 has carry-over
	VSBIQ   V8, V16, V31, V24

	VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
	VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
	VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
	VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
	VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
	VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
	VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
	VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
	VSTM V17, V24, 0(R7) // 128 bytes into z
	ADD  $128, R7
	ADD  $128, R10 // i += 16
	SUB  $16, R3   // n -= 16
	BGE  UU1       // if n >= 0 goto UU1
	VLGVG $1, V0, R4 // put cf into R4
	SUB   $1, R4     // save cf

A1:
	ADD $12, R3 // n += 16
	BLT v1      // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

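// In the scalar subtract loops above, the borrow is kept in R4 between iterations as 0
// (no borrow) or -1 (borrow): the MOVD R0, R11 / SUBC R4, R11 pair computes 0 - R4, which
// sets the machine borrow condition exactly when R4 is -1, the SUBE chain then propagates
// the borrow through the word subtractions, and SUBE R4, R4 recaptures it as 0 or -1.
// The final NEG at E1 converts that to the 0-or-1 Word returned as c.
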
// Register mapping from the amd64 version: DI = R3, CX = R4, SI = R10, r8 = R8, r9 = R9,
// r10 = R2, r11 = R5, r12 = R6, r13 = R7, r14 = R1 (R0 is kept at 0); R11 is also used as scratch.
// func subVV(z, x, y []Word) (c Word)
// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
TEXT ·subVV_novec(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R3
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R2

	MOVD $0, R4  // c = 0
	MOVD $0, R0  // make sure it's zero
	MOVD $0, R10 // i = 0

	// s/BLT/BR/ below to disable the unrolled loop
	SUB $4, R3 // n -= 4
	BLT v1     // if n < 0 goto v1

U1: // n >= 0
	// regular loop body unrolled 4x
	MOVD 0(R8)(R10*1), R5
	MOVD 8(R8)(R10*1), R6
	MOVD 16(R8)(R10*1), R7
	MOVD 24(R8)(R10*1), R1
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD 8(R9)(R10*1), R11
	SUBE R11, R6
	MOVD 16(R9)(R10*1), R11
	SUBE R11, R7
	MOVD 24(R9)(R10*1), R11
	SUBE R11, R1
	MOVD R0, R4
	SUBE R4, R4 // save CF
	MOVD R5, 0(R2)(R10*1)
	MOVD R6, 8(R2)(R10*1)
	MOVD R7, 16(R2)(R10*1)
	MOVD R1, 24(R2)(R10*1)

	ADD $32, R10 // i += 4
	SUB $4, R3   // n -= 4
	BGE U1       // if n >= 0 goto U1

v1:
	ADD $4, R3 // n += 4
	BLE E1     // if n <= 0 goto E1

L1: // n > 0
	MOVD R0, R11
	SUBC R4, R11 // restore CF
	MOVD 0(R8)(R10*1), R5
	MOVD 0(R9)(R10*1), R11
	SUBE R11, R5
	MOVD R5, 0(R2)(R10*1)
	MOVD R0, R4
	SUBE R4, R4 // save CF

	ADD $8, R10 // i++
	SUB $1, R3  // n--
	BGT L1      // if n > 0 goto L1

E1:
	NEG  R4, R4
	MOVD R4, c+72(FP) // return c
	RET

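// The reference semantics for both subVV variants are, roughly (a Go sketch, conversions
// between Word and uint64 elided):
//
//	func subVV(z, x, y []Word) (c Word) {
//		for i := range z {
//			z[i], c = bits.Sub64(x[i], y[i], c)
//		}
//		return c
//	}
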
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5 // length of z
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // c = y
	MOVD z+0(FP), R8

	CMPBEQ R5, $0, returnC // if len(z) == 0, return early

	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
	ADDC   0(R6), R7
	MOVD   R7, 0(R8)
	CMPBEQ R5, $1, returnResult // len(z) == 1
	MOVD   $0, R9
	ADDE   8(R6), R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult // len(z) == 2

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $12, copySetup // carry = 0, copy the rest
	MOVD $1, R9

	// Originally the carry flag generated in the previous iteration was used here
	// (i.e. ADDE could do the addition). However, since we already know the carry is 1
	// (otherwise we would have branched to the copy section), we can use ADDC, so the
	// current iteration does not depend on the carry flag produced by the previous one.
	// Removing that cross-iteration dependency can help when the branch is predicted.
	ADDC 0(R6)(R12*1), R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
	MOVD $0, R0
	ADDE R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and copy the
// remaining words. Assuming x and z either do not overlap or point to exactly the same memory
// region, a plain MVC-based copy is safe here. The code below uses three copy loops, copying
// 1 word, 4 words, and 32 words at a time respectively; benchmarking shows this to be faster
// than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // copy the remaining words (n < 4), one word per MVC
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as carry
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time.
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

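// addVW adds the single word y into the vector x, storing the result in z. Once the carry
// becomes zero there is nothing left to add, so the remaining words of z are just a copy of
// the remaining words of x; the MVC loops above handle that copy. The semantics are, roughly
// (a Go sketch, conversions between Word and uint64 elided):
//
//	func addVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i := range z {
//			z[i], c = bits.Add64(x[i], c, 0)
//		}
//		return c
//	}
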
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R5
	MOVD x+24(FP), R6
	MOVD y+48(FP), R7 // c = y, the word to subtract
	MOVD z+0(FP), R8
	MOVD $0, R0 // R0 is used as a zero register during the computation; make sure it is zero

	CMPBEQ R5, $0, returnC // if len(z) == 0, return early

	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
	MOVD   0(R6), R9
	SUBC   R7, R9
	MOVD   R9, 0(R8)
	CMPBEQ R5, $1, returnResult
	MOVD   8(R6), R9
	SUBE   R0, R9
	MOVD   R9, 8(R8)
	CMPBEQ R5, $2, returnResult

	// Update the counters
	MOVD $16, R12    // i = 2
	MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
	BRC  $3, copySetup // no borrow, copy the rest
	MOVD 0(R6)(R12*1), R9

	// Originally the borrow flag generated in the previous iteration was used here
	// (i.e. SUBE could do the subtraction). However, since we already know the borrow is 1
	// (otherwise we would have branched to the copy section), we can use SUBC, so the
	// current iteration does not depend on the borrow flag produced by the previous one.
	// Removing that cross-iteration dependency can help when the branch is predicted.
	SUBC $1, R9
	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1

	MOVD  $8(R12), R12         // i++
	BRCTG R5, loopOverEachWord // n--

// Return the current borrow value
returnResult:
	SUBE R0, R0
	NEG  R0, R0
	MOVD R0, c+56(FP)
	RET

// Update the positions of x (R6) and z (R8) based on the current counter value and copy the
// remaining words. Assuming x and z either do not overlap or point to exactly the same memory
// region, a plain MVC-based copy is safe here. The code below uses three copy loops, copying
// 1 word, 4 words, and 32 words at a time respectively; benchmarking shows this to be faster
// than calling runtime·memmove.
copySetup:
	ADD R12, R6
	ADD R12, R8

	CMPBGE R5, $4, mediumLoop

smallLoop: // copy the remaining words (n < 4), one word per MVC
	CMPBEQ R5, $0, returnZero
	MVC    $8, 0(R6), 0(R8)
	CMPBEQ R5, $1, returnZero
	MVC    $8, 8(R6), 8(R8)
	CMPBEQ R5, $2, returnZero
	MVC    $8, 16(R6), 16(R8)

returnZero:
	MOVD $0, c+56(FP) // return 0 as borrow
	RET

mediumLoop:
	CMPBLT R5, $4, smallLoop
	CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time
	MVC    $256, 0(R6), 0(R8)
	MOVD   $256(R6), R6
	MOVD   $256(R8), R8
	MOVD   $-32(R5), R5
	CMPBGE R5, $32, largeLoop
	BR     mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
	MVC    $32, 0(R6), 0(R8)
	MOVD   $32(R6), R6
	MOVD   $32(R8), R8
	MOVD   $-4(R5), R5
	CMPBGE R5, $4, mediumLoopBody
	BR     smallLoop

returnC:
	MOVD R7, c+56(FP)
	RET

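// subVW subtracts the single word y from the vector x, storing the result in z. Once the
// borrow becomes zero the remaining words of z are a plain copy of x, handled by the MVC
// loops above. The semantics are, roughly (a Go sketch, conversions between Word and
// uint64 elided):
//
//	func subVW(z, x []Word, y Word) (c Word) {
//		c = y
//		for i := range z {
//			z[i], c = bits.Sub64(x[i], c, 0)
//		}
//		return c
//	}
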
// func shlVU(z, x []Word, s uint) (c Word)
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// func shrVU(z, x []Word, s uint) (c Word)
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

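// The shift routines have no assembly fast path here: they simply tail-branch to the
// generic Go implementations shlVU_g and shrVU_g in arith.go.
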
// Register mapping from the amd64 version: CX = R4, r8 = R8, r9 = R9, r10 = R2, r11 = R5,
// DX = R3, AX = R6, BX = R1 (R0 set to 0); R11 is also used, and R7 holds i.
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4 // c = r
	MOVD z_len+8(FP), R5
	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	BR   E5

L5:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	ADDC   R4, R11 // add to low order bits
	ADDE   R0, R6
	MOVD   R11, (R2)(R1*1)
	MOVD   R6, R4
	ADD    $8, R1 // i*8 += 8
	ADD    $1, R7 // i++

E5:
	CMPBLT R7, R5, L5 // i < n

	MOVD R4, c+64(FP)
	RET

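// mulAddVWW computes z = x*y + r one word at a time, carrying the high half of each product
// into the next iteration. Roughly (a Go sketch, conversions between Word and uint64 elided):
//
//	func mulAddVWW(z, x []Word, y, r Word) (c Word) {
//		c = r
//		for i := range z {
//			hi, lo := bits.Mul64(x[i], y)
//			lo, carry := bits.Add64(lo, c, 0)
//			z[i], c = lo, hi+carry
//		}
//		return c
//	}
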
// func addMulVVW(z, x []Word, y Word) (c Word)
// Register mapping from the amd64 version: CX = R4, r8 = R8, r9 = R9, r10 = R2, r11 = R5,
// AX = R11, DX = R6, r12 = R12, BX = R1 (R0 set to 0); R11 is also used, and R7 holds i.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R2
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R5

	MOVD $0, R1 // i*8 = 0
	MOVD $0, R7 // i = 0
	MOVD $0, R0 // make sure it's zero
	MOVD $0, R4 // c = 0

	MOVD   R5, R12
	AND    $-2, R12
	CMPBGE R5, $2, A6
	BR     E6

A6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	MOVD   (8)(R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (8)(R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (8)(R2)(R1*1)

	ADD $16, R1 // i*8 += 16
	ADD $2, R7  // i += 2

	CMPBLT R7, R12, A6
	BR     E6

L6:
	MOVD   (R8)(R1*1), R6
	MULHDU R9, R6
	MOVD   (R2)(R1*1), R10
	ADDC   R10, R11 // add to low order bits
	ADDE   R0, R6
	ADDC   R4, R11
	ADDE   R0, R6
	MOVD   R6, R4
	MOVD   R11, (R2)(R1*1)

	ADD $8, R1 // i*8 += 8
	ADD $1, R7 // i++

E6:
	CMPBLT R7, R5, L6 // i < n

	MOVD R4, c+56(FP)
	RET
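
// addMulVVW computes z[i] += x[i]*y + c for each word, returning the final carry. The main
// loop is unrolled 2x: R12 holds the length rounded down to an even count (AND $-2), the A6
// loop handles pairs of words, and the L6 loop handles a possible final odd word. Roughly
// (a Go sketch, conversions between Word and uint64 elided):
//
//	func addMulVVW(z, x []Word, y Word) (c Word) {
//		for i := range z {
//			hi, lo := bits.Mul64(x[i], y)
//			lo, carry := bits.Add64(lo, z[i], 0)
//			hi += carry
//			lo, carry = bits.Add64(lo, c, 0)
//			z[i], c = lo, hi+carry
//		}
//		return c
//	}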