1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Based on CRYPTOGAMS code with the following comment:
6 // # ====================================================================
7 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
8 // # project. The module is, however, dual licensed under OpenSSL and
9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
10 // # details see http://www.openssl.org/~appro/cryptogams/.
11 // # ====================================================================
12
13 // Code for the perl script that generates the ppc64 assembler
14 // can be found in the cryptogams repository at the link below. It is based on
15 // the original from openssl.
16
17 // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
18
19 // The differences in this and the original implementation are
20 // due to the calling conventions and initialization of constants.
21
22 //go:build gc && !purego
23 // +build gc,!purego
24
25 #include "textflag.h"
26
27 #define OUT R3
28 #define INP R4
29 #define LEN R5
30 #define KEY R6
31 #define CNT R7
32 #define TMP R15
33
34 #define CONSTBASE R16
35 #define BLOCKS R17
36
37 DATA consts<>+0x00(SB)/8, $0x3320646e61707865 // "expa" | "nd 3" — ChaCha sigma, little-endian words
38 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 // "2-by" | "te k" — together: "expand 32-byte k"
39 DATA consts<>+0x10(SB)/8, $0x0000000000000001 // {1,0,0,0} counter increment (from the original; not loaded by this routine)
40 DATA consts<>+0x18(SB)/8, $0x0000000000000000
41 DATA consts<>+0x20(SB)/8, $0x0000000000000004 // {4,0,0,0} counter increment (from the original; not loaded by this routine)
42 DATA consts<>+0x28(SB)/8, $0x0000000000000000
43 DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d // NOTE(review): permutation masks retained from the CRYPTOGAMS
44 DATA consts<>+0x38(SB)/8, $0x0203000106070405 // original (presumably rotate/byte-swap vperm tables); no load
45 DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c // in this file references offsets 0x10-0x48 — kept for layout
46 DATA consts<>+0x48(SB)/8, $0x0102030005060704 // compatibility with the generated code.
47 DATA consts<>+0x50(SB)/8, $0x6170786561707865 // "expa" splatted across all 4 lanes: row 0 of the
48 DATA consts<>+0x58(SB)/8, $0x6170786561707865 // 4-way-parallel state is one sigma word per vector.
49 DATA consts<>+0x60(SB)/8, $0x3320646e3320646e // "nd 3" x4
50 DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
51 DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 // "2-by" x4
52 DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
53 DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 // "te k" x4
54 DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
55 DATA consts<>+0x90(SB)/8, $0x0000000100000000 // {0,1,2,3}: per-lane block-counter offsets, added to the
56 DATA consts<>+0x98(SB)/8, $0x0000000300000002 // splatted counter so the 4 lanes encrypt 4 consecutive blocks
57 GLOBL consts<>(SB), RODATA, $0xa0 // 0xa0 = 160 bytes total, read-only
58
59 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
60 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
61 MOVD out+0(FP), OUT
62 MOVD inp+8(FP), INP
63 MOVD len+16(FP), LEN
64 MOVD key+24(FP), KEY
65 MOVD counter+32(FP), CNT
66
67 // Addressing for constants
68 MOVD $consts<>+0x00(SB), CONSTBASE
69 MOVD $16, R8
70 MOVD $32, R9
71 MOVD $48, R10
72 MOVD $64, R11
73 SRD $6, LEN, BLOCKS // BLOCKS = LEN >> 6: number of whole 64-byte blocks produced
74 // V16 = sigma ("expand 32-byte k")
75 LXVW4X (CONSTBASE)(R0), VS48
76 ADD $80,CONSTBASE // CONSTBASE now points at the splatted sigma rows (+0x50)
77
78 // Load key into V17,V18
79 LXVW4X (KEY)(R0), VS49
80 LXVW4X (KEY)(R8), VS50
81
82 // Load CNT, NONCE into V19
83 LXVW4X (CNT)(R0), VS51
84
85 // Clear V27
86 VXOR V27, V27, V27
87
88 // V28 = {0,1,2,3} lane offsets (CONSTBASE+0x50 + 64 = consts+0x90)
89 LXVW4X (CONSTBASE)(R11), VS60
90
91 // splat counter word from V19 -> V26
92 VSPLTW $0, V19, V26
93
94 // Shift the counter word out of V19, leaving {0, nonce0, nonce1, nonce2}
95 VSLDOI $4, V19, V27, V19
96 VSLDOI $12, V27, V19, V19
97
98 // V26 = per-lane block counters {ctr, ctr+1, ctr+2, ctr+3}
99 VADDUWM V26, V28, V26
100
101 MOVD $10, R14 // 10 double-rounds = 20 ChaCha rounds
102 MOVD R14, CTR
103
104 loop_outer_vsx:
105 // V0, V1, V2, V3 = sigma words, each splatted across all 4 lanes
106 LXVW4X (R0)(CONSTBASE), VS32
107 LXVW4X (R8)(CONSTBASE), VS33
108 LXVW4X (R9)(CONSTBASE), VS34
109 LXVW4X (R10)(CONSTBASE), VS35
110
111 // splat key words from V17, V18 into V4-V11
112 VSPLTW $0, V17, V4
113 VSPLTW $1, V17, V5
114 VSPLTW $2, V17, V6
115 VSPLTW $3, V17, V7
116 VSPLTW $0, V18, V8
117 VSPLTW $1, V18, V9
118 VSPLTW $2, V18, V10
119 VSPLTW $3, V18, V11
120
121 // V12 = per-lane block counters (copy of V26)
122 VOR V26, V26, V12
123
124 // splat nonce words from V19 -> V13, V14, V15
125 VSPLTW $1, V19, V13
126 VSPLTW $2, V19, V14
127 VSPLTW $3, V19, V15
128
129 // Rotation counts for VRLW, splatted per word. VRLW only uses the
130 // low 5 bits of each element, so $-16 (0xfffffff0) rotates by 16.
131 VSPLTISW $-16, V27 // rotate 16
132 VSPLTISW $12, V28 // rotate 12
133 VSPLTISW $8, V29 // rotate 8
134 VSPLTISW $7, V30 // rotate 7
135
136 loop_vsx:
137 // Column round: a += b; d ^= a; d <<<= 16
138 VADDUWM V0, V4, V0
139 VADDUWM V1, V5, V1
140 VADDUWM V2, V6, V2
141 VADDUWM V3, V7, V3
142
143 VXOR V12, V0, V12
144 VXOR V13, V1, V13
145 VXOR V14, V2, V14
146 VXOR V15, V3, V15
147
148 VRLW V12, V27, V12
149 VRLW V13, V27, V13
150 VRLW V14, V27, V14
151 VRLW V15, V27, V15
152
153 // c += d; b ^= c; b <<<= 12
154 VADDUWM V8, V12, V8
155 VADDUWM V9, V13, V9
156 VADDUWM V10, V14, V10
157 VADDUWM V11, V15, V11
158
159 VXOR V4, V8, V4
160 VXOR V5, V9, V5
161 VXOR V6, V10, V6
162 VXOR V7, V11, V7
163
164 VRLW V4, V28, V4
165 VRLW V5, V28, V5
166 VRLW V6, V28, V6
167 VRLW V7, V28, V7
168
169 // a += b; d ^= a; d <<<= 8
170 VADDUWM V0, V4, V0
171 VADDUWM V1, V5, V1
172 VADDUWM V2, V6, V2
173 VADDUWM V3, V7, V3
174
175 VXOR V12, V0, V12
176 VXOR V13, V1, V13
177 VXOR V14, V2, V14
178 VXOR V15, V3, V15
179
180 VRLW V12, V29, V12
181 VRLW V13, V29, V13
182 VRLW V14, V29, V14
183 VRLW V15, V29, V15
184
185 // c += d; b ^= c; b <<<= 7
186 VADDUWM V8, V12, V8
187 VADDUWM V9, V13, V9
188 VADDUWM V10, V14, V10
189 VADDUWM V11, V15, V11
190
191 VXOR V4, V8, V4
192 VXOR V5, V9, V5
193 VXOR V6, V10, V6
194 VXOR V7, V11, V7
195
196 VRLW V4, V30, V4
197 VRLW V5, V30, V5
198 VRLW V6, V30, V6
199 VRLW V7, V30, V7
200
201 // Diagonal round: same quarter-round, applied to the diagonals
202 VADDUWM V0, V5, V0
203 VADDUWM V1, V6, V1
204 VADDUWM V2, V7, V2
205 VADDUWM V3, V4, V3
206
207 VXOR V15, V0, V15
208 VXOR V12, V1, V12
209 VXOR V13, V2, V13
210 VXOR V14, V3, V14
211
212 VRLW V15, V27, V15
213 VRLW V12, V27, V12
214 VRLW V13, V27, V13
215 VRLW V14, V27, V14
216
217 VADDUWM V10, V15, V10
218 VADDUWM V11, V12, V11
219 VADDUWM V8, V13, V8
220 VADDUWM V9, V14, V9
221
222 VXOR V5, V10, V5
223 VXOR V6, V11, V6
224 VXOR V7, V8, V7
225 VXOR V4, V9, V4
226
227 VRLW V5, V28, V5
228 VRLW V6, V28, V6
229 VRLW V7, V28, V7
230 VRLW V4, V28, V4
231
232 VADDUWM V0, V5, V0
233 VADDUWM V1, V6, V1
234 VADDUWM V2, V7, V2
235 VADDUWM V3, V4, V3
236
237 VXOR V15, V0, V15
238 VXOR V12, V1, V12
239 VXOR V13, V2, V13
240 VXOR V14, V3, V14
241
242 VRLW V15, V29, V15
243 VRLW V12, V29, V12
244 VRLW V13, V29, V13
245 VRLW V14, V29, V14
246
247 VADDUWM V10, V15, V10
248 VADDUWM V11, V12, V11
249 VADDUWM V8, V13, V8
250 VADDUWM V9, V14, V9
251
252 VXOR V5, V10, V5
253 VXOR V6, V11, V6
254 VXOR V7, V8, V7
255 VXOR V4, V9, V4
256
257 VRLW V5, V30, V5
258 VRLW V6, V30, V6
259 VRLW V7, V30, V7
260 VRLW V4, V30, V4
261 BC 16, LT, loop_vsx // bdnz: decrement CTR, loop while CTR != 0
262
263 // Feed-forward the counter lane, then transpose the 16 word-splatted
264 // vectors into 4 consecutive 64-byte keystream blocks.
265 VADDUWM V12, V26, V12
266
267 // VMRGEW/VMRGOW are emitted as raw opcodes (WORD) — the assembler
268 // did not support these mnemonics when this was written.
269 WORD $0x13600F8C // VMRGEW V0, V1, V27
270 WORD $0x13821F8C // VMRGEW V2, V3, V28
271
272 WORD $0x10000E8C // VMRGOW V0, V1, V0
273 WORD $0x10421E8C // VMRGOW V2, V3, V2
274
275 WORD $0x13A42F8C // VMRGEW V4, V5, V29
276 WORD $0x13C63F8C // VMRGEW V6, V7, V30
277
278 XXPERMDI VS32, VS34, $0, VS33
279 XXPERMDI VS32, VS34, $3, VS35
280 XXPERMDI VS59, VS60, $0, VS32
281 XXPERMDI VS59, VS60, $3, VS34
282
283 WORD $0x10842E8C // VMRGOW V4, V5, V4
284 WORD $0x10C63E8C // VMRGOW V6, V7, V6
285
286 WORD $0x13684F8C // VMRGEW V8, V9, V27
287 WORD $0x138A5F8C // VMRGEW V10, V11, V28
288
289 XXPERMDI VS36, VS38, $0, VS37
290 XXPERMDI VS36, VS38, $3, VS39
291 XXPERMDI VS61, VS62, $0, VS36
292 XXPERMDI VS61, VS62, $3, VS38
293
294 WORD $0x11084E8C // VMRGOW V8, V9, V8
295 WORD $0x114A5E8C // VMRGOW V10, V11, V10
296
297 WORD $0x13AC6F8C // VMRGEW V12, V13, V29
298 WORD $0x13CE7F8C // VMRGEW V14, V15, V30
299
300 XXPERMDI VS40, VS42, $0, VS41
301 XXPERMDI VS40, VS42, $3, VS43
302 XXPERMDI VS59, VS60, $0, VS40
303 XXPERMDI VS59, VS60, $3, VS42
304
305 // Advance the per-lane block counters by 4 for the next outer iteration
306 VSPLTISW $4, V27
307 VADDUWM V26, V27, V26
308
309 XXPERMDI VS44, VS46, $0, VS45
310 XXPERMDI VS44, VS46, $3, VS47
311 XXPERMDI VS61, VS62, $0, VS44
312 XXPERMDI VS61, VS62, $3, VS46
313
314 // Feed-forward the original state (sigma, key, counter/nonce) into block 0
315 VADDUWM V0, V16, V0
316 VADDUWM V4, V17, V4
317 VADDUWM V8, V18, V8
318 VADDUWM V12, V19, V12
319
320 CMPU LEN, $64
321 BLT tail_vsx // partial block: byte-by-byte XOR via the stack
322
323 // Block 0: load 64 bytes of input, XOR with keystream, store
324 LXVW4X (INP)(R0), VS59
325 LXVW4X (INP)(R8), VS60
326 LXVW4X (INP)(R9), VS61
327 LXVW4X (INP)(R10), VS62
328
329 VXOR V27, V0, V27
330 VXOR V28, V4, V28
331 VXOR V29, V8, V29
332 VXOR V30, V12, V30
333
334 STXVW4X VS59, (OUT)(R0)
335 STXVW4X VS60, (OUT)(R8)
336 ADD $64, INP
337 STXVW4X VS61, (OUT)(R9)
338 ADD $-64, LEN
339 STXVW4X VS62, (OUT)(R10)
340 ADD $64, OUT
341 BEQ done_vsx // EQ from CMPU above: LEN was exactly 64, now 0
342
343 // Block 1: feed-forward into the next keystream block
344 VADDUWM V1, V16, V0
345 VADDUWM V5, V17, V4
346 VADDUWM V9, V18, V8
347 VADDUWM V13, V19, V12
348
349 CMPU LEN, $64
350 BLT tail_vsx
351
352 LXVW4X (INP)(R0), VS59
353 LXVW4X (INP)(R8), VS60
354 LXVW4X (INP)(R9), VS61
355 LXVW4X (INP)(R10), VS62
356 VXOR V27, V0, V27
357
358 VXOR V28, V4, V28
359 VXOR V29, V8, V29
360 VXOR V30, V12, V30
361
362 STXVW4X VS59, (OUT)(R0)
363 STXVW4X VS60, (OUT)(R8)
364 ADD $64, INP
365 STXVW4X VS61, (OUT)(R9)
366 ADD $-64, LEN
367 STXVW4X VS62, (OUT)(R10) // fixed: index register was mistyped as V10; a
368 ADD $64, OUT // vector reg is not a valid GPR index — R10 holds offset 48
369 BEQ done_vsx
370
371 // Block 2
372 VADDUWM V2, V16, V0
373 VADDUWM V6, V17, V4
374 VADDUWM V10, V18, V8
375 VADDUWM V14, V19, V12
376
377 CMPU LEN, $64
378 BLT tail_vsx
379
380 LXVW4X (INP)(R0), VS59
381 LXVW4X (INP)(R8), VS60
382 LXVW4X (INP)(R9), VS61
383 LXVW4X (INP)(R10), VS62
384
385 VXOR V27, V0, V27
386 VXOR V28, V4, V28
387 VXOR V29, V8, V29
388 VXOR V30, V12, V30
389
390 STXVW4X VS59, (OUT)(R0)
391 STXVW4X VS60, (OUT)(R8)
392 ADD $64, INP
393 STXVW4X VS61, (OUT)(R9)
394 ADD $-64, LEN
395 STXVW4X VS62, (OUT)(R10)
396 ADD $64, OUT
397 BEQ done_vsx
398
399 // Block 3
400 VADDUWM V3, V16, V0
401 VADDUWM V7, V17, V4
402 VADDUWM V11, V18, V8
403 VADDUWM V15, V19, V12
404
405 CMPU LEN, $64
406 BLT tail_vsx
407
408 LXVW4X (INP)(R0), VS59
409 LXVW4X (INP)(R8), VS60
410 LXVW4X (INP)(R9), VS61
411 LXVW4X (INP)(R10), VS62
412
413 VXOR V27, V0, V27
414 VXOR V28, V4, V28
415 VXOR V29, V8, V29
416 VXOR V30, V12, V30
417
418 STXVW4X VS59, (OUT)(R0)
419 STXVW4X VS60, (OUT)(R8)
420 ADD $64, INP
421 STXVW4X VS61, (OUT)(R9)
422 ADD $-64, LEN
423 STXVW4X VS62, (OUT)(R10)
424 ADD $64, OUT
425
426 // Reload the round counter and run another 4-block batch if LEN > 0
427 MOVD $10, R14
428 MOVD R14, CTR
429 BNE loop_outer_vsx
430
431 done_vsx:
432 // Increment counter by number of 64 byte blocks
433 MOVD (CNT), R14
434 ADD BLOCKS, R14
435 MOVD R14, (CNT)
436 RET
437
438 tail_vsx:
439 ADD $32, R1, R11 // R11 = scratch area in this frame's stack space
440 MOVD LEN, CTR // CTR = remaining bytes (< 64)
441
442 // Save keystream block on stack to copy from
443 STXVW4X VS32, (R11)(R0)
444 STXVW4X VS36, (R11)(R8)
445 STXVW4X VS40, (R11)(R9)
446 STXVW4X VS44, (R11)(R10)
447 ADD $-1, R11, R12 // bias pointers by -1 for the pre-increment
448 ADD $-1, INP // loads/stores below
449 ADD $-1, OUT
450
451 looptail_vsx:
452 // XOR the result into OUT one byte at a time
453 // (KEY/TMP registers are reused as byte scratch here).
454 MOVBZU 1(R12), KEY
455 MOVBZU 1(INP), TMP
456 XOR KEY, TMP, KEY
457 MOVBU KEY, 1(OUT)
458 BC 16, LT, looptail_vsx // bdnz: loop LEN times
459
460 // Clear the keystream copy from the stack (V16 = sigma, a public constant)
461 STXVW4X VS48, (R11)(R0)
462 STXVW4X VS48, (R11)(R8)
463 STXVW4X VS48, (R11)(R9)
464 STXVW4X VS48, (R11)(R10)
465 BR done_vsx
451
View as plain text