1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build go1.11 && gc && !purego
6 // +build go1.11,gc,!purego
7
8 #include "textflag.h"
9
10 #define NUM_ROUNDS 10
11
12 // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha keystream and stores the result in dst.
// Each pass of the outer loop computes four 64-byte blocks in parallel
// (256 bytes), running NUM_ROUNDS iterations of the column+diagonal round pair,
// and then writes the advanced block counter back through counter.
//
// The outer loop tests its exit condition only at the bottom, so one full
// 256-byte pass always runs, and bytes past the last whole 256-byte chunk of
// src are not processed here.
// NOTE(review): presumably the Go caller only passes non-empty lengths that are
// a multiple of 256 - confirm against the wrapper.
//
// General-purpose registers:
//   R1  dst write pointer (post-incremented by VST1.P)
//   R2  src read pointer (post-incremented by VLD1.P)
//   R3  src_len
//   R4  key pointer
//   R6  nonce pointer
//   R7  counter pointer
//   R10 address of ·constants
//   R11 address of ·incRotMatrix
//   R12 src + (src_len &^ 255), the end of the whole 256-byte chunks
//   R20 scalar copy of the block counter
//   R21 remaining iterations of the chacha loop
//
// Vector registers during the rounds (each 32-bit lane belongs to a different
// block, so one register holds the same state word for four blocks):
//   V0-V3   constant words
//   V4-V11  key words
//   V12     block counter (lane i holds counter+i)
//   V13-V15 nonce words
//   V16-V19 temporaries for the shift-based rotates
//   V30     lane increments {0,1,2,3}
//   V31     byte-shuffle table implementing a 32-bit rotate left by 8
13 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
14 MOVD dst+0(FP), R1
15 MOVD src+24(FP), R2
16 MOVD src_len+32(FP), R3
17 MOVD key+48(FP), R4
18 MOVD nonce+56(FP), R6
19 MOVD counter+64(FP), R7
20
21 MOVD $·constants(SB), R10
22 MOVD $·incRotMatrix(SB), R11
23
24 MOVW (R7), R20 // R20 = current 32-bit block counter
25
26 AND $~255, R3, R13 // R13 = src_len rounded down to a multiple of 256
27 ADD R2, R13, R12 // R12 for block end
28 AND $255, R3, R13 // R13 = src_len mod 256; not read again in this function
29 loop:
30 MOVD $NUM_ROUNDS, R21
// V30 = first 16 bytes of incRotMatrix, V31 = the following 16 bytes.
// Both are reloaded on every pass because the nonce load near the end of the
// loop body overwrites them.
31 VLD1 (R11), [V30.S4, V31.S4]
32
// The replicating loads below are emitted as raw WORD encodings; the comment
// above each one gives the instruction it stands for.
33 // load constants
34 // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
35 WORD $0x4D60E940
36
37 // load keys
38 // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
39 WORD $0x4DFFE884
40 // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
41 WORD $0x4DFFE888
// Undo the two 16-byte post-increments so R4 points at the key start again.
42 SUB $32, R4
43
44 // load counter + nonce
45 // VLD1R (R7), [V12.S4]
46 WORD $0x4D40C8EC
47
48 // VLD3R (R6), [V13.S4, V14.S4, V15.S4]
49 WORD $0x4D40E8CD
50
51 // update counter
// Lane i gets counter+i, so the four lanes produce four consecutive blocks.
52 VADD V30.S4, V12.S4, V12.S4
53
// One iteration of this loop is a column round followed by a diagonal round.
54 chacha:
// Column round: quarter rounds on (V0,V4,V8,V12), (V1,V5,V9,V13),
// (V2,V6,V10,V14), (V3,V7,V11,V15).
55 // V0..V3 += V4..V7
56 // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
57 VADD V0.S4, V4.S4, V0.S4
58 VADD V1.S4, V5.S4, V1.S4
59 VADD V2.S4, V6.S4, V2.S4
60 VADD V3.S4, V7.S4, V3.S4
61 VEOR V12.B16, V0.B16, V12.B16
62 VEOR V13.B16, V1.B16, V13.B16
63 VEOR V14.B16, V2.B16, V14.B16
64 VEOR V15.B16, V3.B16, V15.B16
// Swapping the 16-bit halves of each 32-bit word is a rotate by 16.
65 VREV32 V12.H8, V12.H8
66 VREV32 V13.H8, V13.H8
67 VREV32 V14.H8, V14.H8
68 VREV32 V15.H8, V15.H8
69 // V8..V11 += V12..V15
70 // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
71 VADD V8.S4, V12.S4, V8.S4
72 VADD V9.S4, V13.S4, V9.S4
73 VADD V10.S4, V14.S4, V10.S4
74 VADD V11.S4, V15.S4, V11.S4
75 VEOR V8.B16, V4.B16, V16.B16
76 VEOR V9.B16, V5.B16, V17.B16
77 VEOR V10.B16, V6.B16, V18.B16
78 VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 12: shift left by 12, then insert the value shifted right by 20.
79 VSHL $12, V16.S4, V4.S4
80 VSHL $12, V17.S4, V5.S4
81 VSHL $12, V18.S4, V6.S4
82 VSHL $12, V19.S4, V7.S4
83 VSRI $20, V16.S4, V4.S4
84 VSRI $20, V17.S4, V5.S4
85 VSRI $20, V18.S4, V6.S4
86 VSRI $20, V19.S4, V7.S4
87
88 // V0..V3 += V4..V7
89 // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
90 VADD V0.S4, V4.S4, V0.S4
91 VADD V1.S4, V5.S4, V1.S4
92 VADD V2.S4, V6.S4, V2.S4
93 VADD V3.S4, V7.S4, V3.S4
94 VEOR V12.B16, V0.B16, V12.B16
95 VEOR V13.B16, V1.B16, V13.B16
96 VEOR V14.B16, V2.B16, V14.B16
97 VEOR V15.B16, V3.B16, V15.B16
// Rotate left by 8 via a byte shuffle through the table in V31.
98 VTBL V31.B16, [V12.B16], V12.B16
99 VTBL V31.B16, [V13.B16], V13.B16
100 VTBL V31.B16, [V14.B16], V14.B16
101 VTBL V31.B16, [V15.B16], V15.B16
102
103 // V8..V11 += V12..V15
104 // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
105 VADD V12.S4, V8.S4, V8.S4
106 VADD V13.S4, V9.S4, V9.S4
107 VADD V14.S4, V10.S4, V10.S4
108 VADD V15.S4, V11.S4, V11.S4
109 VEOR V8.B16, V4.B16, V16.B16
110 VEOR V9.B16, V5.B16, V17.B16
111 VEOR V10.B16, V6.B16, V18.B16
112 VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 7: shift left by 7, then insert the value shifted right by 25.
113 VSHL $7, V16.S4, V4.S4
114 VSHL $7, V17.S4, V5.S4
115 VSHL $7, V18.S4, V6.S4
116 VSHL $7, V19.S4, V7.S4
117 VSRI $25, V16.S4, V4.S4
118 VSRI $25, V17.S4, V5.S4
119 VSRI $25, V18.S4, V6.S4
120 VSRI $25, V19.S4, V7.S4
121
// Diagonal round: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
// (V2,V7,V8,V13), (V3,V4,V9,V14). The diagonals are selected by register
// choice only; no data is moved between registers.
122 // V0..V3 += V5..V7, V4
123 // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
124 VADD V0.S4, V5.S4, V0.S4
125 VADD V1.S4, V6.S4, V1.S4
126 VADD V2.S4, V7.S4, V2.S4
127 VADD V3.S4, V4.S4, V3.S4
128 VEOR V15.B16, V0.B16, V15.B16
129 VEOR V12.B16, V1.B16, V12.B16
130 VEOR V13.B16, V2.B16, V13.B16
131 VEOR V14.B16, V3.B16, V14.B16
132 VREV32 V12.H8, V12.H8
133 VREV32 V13.H8, V13.H8
134 VREV32 V14.H8, V14.H8
135 VREV32 V15.H8, V15.H8
136
137 // V10 += V15; V5 <<<= ((V10 XOR V5), 12)
138 // ...
139 VADD V15.S4, V10.S4, V10.S4
140 VADD V12.S4, V11.S4, V11.S4
141 VADD V13.S4, V8.S4, V8.S4
142 VADD V14.S4, V9.S4, V9.S4
143 VEOR V10.B16, V5.B16, V16.B16
144 VEOR V11.B16, V6.B16, V17.B16
145 VEOR V8.B16, V7.B16, V18.B16
146 VEOR V9.B16, V4.B16, V19.B16
147 VSHL $12, V16.S4, V5.S4
148 VSHL $12, V17.S4, V6.S4
149 VSHL $12, V18.S4, V7.S4
150 VSHL $12, V19.S4, V4.S4
151 VSRI $20, V16.S4, V5.S4
152 VSRI $20, V17.S4, V6.S4
153 VSRI $20, V18.S4, V7.S4
154 VSRI $20, V19.S4, V4.S4
155
156 // V0 += V5; V15 <<<= ((V0 XOR V15), 8)
157 // ...
158 VADD V5.S4, V0.S4, V0.S4
159 VADD V6.S4, V1.S4, V1.S4
160 VADD V7.S4, V2.S4, V2.S4
161 VADD V4.S4, V3.S4, V3.S4
162 VEOR V0.B16, V15.B16, V15.B16
163 VEOR V1.B16, V12.B16, V12.B16
164 VEOR V2.B16, V13.B16, V13.B16
165 VEOR V3.B16, V14.B16, V14.B16
166 VTBL V31.B16, [V12.B16], V12.B16
167 VTBL V31.B16, [V13.B16], V13.B16
168 VTBL V31.B16, [V14.B16], V14.B16
169 VTBL V31.B16, [V15.B16], V15.B16
170
171 // V10 += V15; V5 <<<= ((V10 XOR V5), 7)
172 // ...
173 VADD V15.S4, V10.S4, V10.S4
174 VADD V12.S4, V11.S4, V11.S4
175 VADD V13.S4, V8.S4, V8.S4
176 VADD V14.S4, V9.S4, V9.S4
177 VEOR V10.B16, V5.B16, V16.B16
178 VEOR V11.B16, V6.B16, V17.B16
179 VEOR V8.B16, V7.B16, V18.B16
180 VEOR V9.B16, V4.B16, V19.B16
181 VSHL $7, V16.S4, V5.S4
182 VSHL $7, V17.S4, V6.S4
183 VSHL $7, V18.S4, V7.S4
184 VSHL $7, V19.S4, V4.S4
185 VSRI $25, V16.S4, V5.S4
186 VSRI $25, V17.S4, V6.S4
187 VSRI $25, V18.S4, V7.S4
188 VSRI $25, V19.S4, V4.S4
189
190 SUB $1, R21
191 CBNZ R21, chacha
192
// Feed-forward: reload the initial state into V16..V31 and add it to the
// working state in V0..V15.
193 // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
194 WORD $0x4D60E950
195
196 // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
197 WORD $0x4DFFE894
// V30 still holds {0,1,2,3} here; this adds the per-lane counter offset before
// V30 is overwritten by the nonce load below.
198 VADD V30.S4, V12.S4, V12.S4
199 VADD V16.S4, V0.S4, V0.S4
200 VADD V17.S4, V1.S4, V1.S4
201 VADD V18.S4, V2.S4, V2.S4
202 VADD V19.S4, V3.S4, V3.S4
203 // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
204 WORD $0x4DFFE898
205 // restore R4
206 SUB $32, R4
207
208 // load counter + nonce
209 // VLD1R (R7), [V28.S4]
210 WORD $0x4D40C8FC
211 // VLD3R (R6), [V29.S4, V30.S4, V31.S4]
212 WORD $0x4D40E8DD
213
214 VADD V20.S4, V4.S4, V4.S4
215 VADD V21.S4, V5.S4, V5.S4
216 VADD V22.S4, V6.S4, V6.S4
217 VADD V23.S4, V7.S4, V7.S4
218 VADD V24.S4, V8.S4, V8.S4
219 VADD V25.S4, V9.S4, V9.S4
220 VADD V26.S4, V10.S4, V10.S4
221 VADD V27.S4, V11.S4, V11.S4
222 VADD V28.S4, V12.S4, V12.S4
223 VADD V29.S4, V13.S4, V13.S4
224 VADD V30.S4, V14.S4, V14.S4
225 VADD V31.S4, V15.S4, V15.S4
226
// Transpose from "one state word per register" to "one block per four
// registers": first interleave 32-bit words into V16..V31, then interleave
// 64-bit halves back into V0..V15. Afterwards V0-V3, V4-V7, V8-V11 and V12-V15
// are XORed with the 1st, 2nd, 3rd and 4th 64 bytes of src respectively.
227 VZIP1 V1.S4, V0.S4, V16.S4
228 VZIP2 V1.S4, V0.S4, V17.S4
229 VZIP1 V3.S4, V2.S4, V18.S4
230 VZIP2 V3.S4, V2.S4, V19.S4
231 VZIP1 V5.S4, V4.S4, V20.S4
232 VZIP2 V5.S4, V4.S4, V21.S4
233 VZIP1 V7.S4, V6.S4, V22.S4
234 VZIP2 V7.S4, V6.S4, V23.S4
235 VZIP1 V9.S4, V8.S4, V24.S4
236 VZIP2 V9.S4, V8.S4, V25.S4
237 VZIP1 V11.S4, V10.S4, V26.S4
238 VZIP2 V11.S4, V10.S4, V27.S4
239 VZIP1 V13.S4, V12.S4, V28.S4
240 VZIP2 V13.S4, V12.S4, V29.S4
241 VZIP1 V15.S4, V14.S4, V30.S4
242 VZIP2 V15.S4, V14.S4, V31.S4
// Each group of V16..V31 is reused as a src buffer as soon as its 64-bit
// interleave has consumed it, so the src loads are interleaved with the
// remaining transpose steps.
243 VZIP1 V18.D2, V16.D2, V0.D2
244 VZIP2 V18.D2, V16.D2, V4.D2
245 VZIP1 V19.D2, V17.D2, V8.D2
246 VZIP2 V19.D2, V17.D2, V12.D2
247 VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
248
249 VZIP1 V22.D2, V20.D2, V1.D2
250 VZIP2 V22.D2, V20.D2, V5.D2
251 VZIP1 V23.D2, V21.D2, V9.D2
252 VZIP2 V23.D2, V21.D2, V13.D2
253 VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
254 VZIP1 V26.D2, V24.D2, V2.D2
255 VZIP2 V26.D2, V24.D2, V6.D2
256 VZIP1 V27.D2, V25.D2, V10.D2
257 VZIP2 V27.D2, V25.D2, V14.D2
258 VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
259 VZIP1 V30.D2, V28.D2, V3.D2
260 VZIP2 V30.D2, V28.D2, V7.D2
261 VZIP1 V31.D2, V29.D2, V11.D2
262 VZIP2 V31.D2, V29.D2, V15.D2
263 VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
// XOR the keystream into the 256 bytes just read and store them to dst.
264 VEOR V0.B16, V16.B16, V16.B16
265 VEOR V1.B16, V17.B16, V17.B16
266 VEOR V2.B16, V18.B16, V18.B16
267 VEOR V3.B16, V19.B16, V19.B16
268 VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
269 VEOR V4.B16, V20.B16, V20.B16
270 VEOR V5.B16, V21.B16, V21.B16
271 VEOR V6.B16, V22.B16, V22.B16
272 VEOR V7.B16, V23.B16, V23.B16
273 VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
274 VEOR V8.B16, V24.B16, V24.B16
275 VEOR V9.B16, V25.B16, V25.B16
276 VEOR V10.B16, V26.B16, V26.B16
277 VEOR V11.B16, V27.B16, V27.B16
278 VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
279 VEOR V12.B16, V28.B16, V28.B16
280 VEOR V13.B16, V29.B16, V29.B16
281 VEOR V14.B16, V30.B16, V30.B16
282 VEOR V15.B16, V31.B16, V31.B16
283 VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
284
// Four blocks were consumed; the stored value is what the replicating counter
// loads read from (R7) on the next pass.
285 ADD $4, R20
286 MOVW R20, (R7) // update counter
287
// Repeat while the src pointer is still below the block end in R12.
288 CMP R2, R12
289 BGT loop
290
291 RET
292
293
// constants holds the four 32-bit ChaCha constant words; stored little-endian
// they are the ASCII bytes of "expand 32-byte k". xorKeyStreamVX loads them
// with a replicating load so each word fills all four lanes of one register.
// The symbol is declared with a size of 32 bytes, but only the first 16 are
// initialized by the DATA directives here.
294 DATA ·constants+0x00(SB)/4, $0x61707865
295 DATA ·constants+0x04(SB)/4, $0x3320646e
296 DATA ·constants+0x08(SB)/4, $0x79622d32
297 DATA ·constants+0x0c(SB)/4, $0x6b206574
298 GLOBL ·constants(SB), NOPTR|RODATA, $32
299
// incRotMatrix is two 16-byte vectors that xorKeyStreamVX loads together into
// V30 and V31.
// Bytes 0x00-0x0f: the 32-bit lane values {0, 1, 2, 3}, added to the replicated
// block counter so each lane works on a different block.
// Bytes 0x10-0x1f: a VTBL byte-index table; within every 32-bit word, result
// bytes 0..3 are taken from source bytes 3,0,1,2, which rotates the
// little-endian word left by 8 bits.
300 DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
301 DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
302 DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
303 DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
304 DATA ·incRotMatrix+0x10(SB)/4, $0x02010003
305 DATA ·incRotMatrix+0x14(SB)/4, $0x06050407
306 DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B
307 DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
308 GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32
309
View as plain text