1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build gc && !purego
6 // +build gc,!purego
7
8 #include "go_asm.h"
9 #include "textflag.h"
10
11 // This is an implementation of the ChaCha20 encryption algorithm as
12 // specified in RFC 7539. It uses vector instructions to compute
13 // 4 keystream blocks in parallel (256 bytes) which are then XORed
14 // with the bytes in the input slice.
15
// ·constants<> is a 32-byte read-only table. xorKeyStreamVX loads it into
// the BSWAP and J0 vector registers with a single VLM.

// Bytes 0-15, BSWAP: VPERM byte indices that reverse the order of the
// bytes inside each 4-byte element.
DATA ·constants<>+0(SB)/4, $0x03020100
DATA ·constants<>+4(SB)/4, $0x07060504
DATA ·constants<>+8(SB)/4, $0x0b0a0908
DATA ·constants<>+12(SB)/4, $0x0f0e0d0c

// Bytes 16-31, J0: [j0, j1, j2, j3]. Read as little-endian bytes, the four
// words spell the ASCII string "expand 32-byte k".
DATA ·constants<>+16(SB)/4, $0x61707865
DATA ·constants<>+20(SB)/4, $0x3320646e
DATA ·constants<>+24(SB)/4, $0x79622d32
DATA ·constants<>+28(SB)/4, $0x6b206574

GLOBL ·constants<>(SB), RODATA|NOPTR, $32
27
// Vector register assignments used by the macros and by xorKeyStreamVX.

// Loaded together from ·constants<> by one VLM, so BSWAP and J0 must stay
// in consecutive registers.
#define BSWAP V5 // byte-reversal index vector for VPERM
#define J0    V6 // constant words [j0, j1, j2, j3]

// Loaded together from the key by one VLM, so KEY0 and KEY1 must stay in
// consecutive registers.
#define KEY0 V7 // first 16 bytes of the key
#define KEY1 V8 // last 16 bytes of the key

#define NONCE V9  // nonce, shifted right by 4 bytes after loading
#define CTR   V10 // one block counter per 32-bit element
#define INC   V15 // per-element counter increment

// Message buffer / scratch registers. XORV moves them with VLM/VSTM as the
// range M0..M3, so they must stay in consecutive registers.
#define M0 V11
#define M1 V12
#define M2 V13
#define M3 V14

// Working state, one vector per state word while the rounds are running.
#define X0  V16
#define X1  V17
#define X2  V18
#define X3  V19
#define X4  V20
#define X5  V21
#define X6  V22
#define X7  V23
#define X8  V24
#define X9  V25
#define X10 V26
#define X11 V27
#define X12 V28
#define X13 V29
#define X14 V30
#define X15 V31

// Total number of rounds; the round loop runs NUM_ROUNDS/2 times with two
// ROUND4 invocations per iteration.
#define NUM_ROUNDS 20
57
// ADD_XOR_ROL4 performs one add/xor/rotate step on four independent
// register triples (s, t, u):
//
//	s += t
//	u ^= s
//	u = u rotated left by rot bits (per 32-bit element)
//
// Each operation is issued for all four triples before the next operation
// starts, so the four dependency chains are interleaved.
#define ADD_XOR_ROL4(rot, s0, t0, u0, s1, t1, u1, s2, t2, u2, s3, t3, u3) \
	VAF    t0, s0, s0   \
	VAF    t1, s1, s1   \
	VAF    t2, s2, s2   \
	VAF    t3, s3, s3   \
	VX     s0, u0, u0   \
	VX     s1, u1, u1   \
	VX     s2, u2, u2   \
	VX     s3, u3, u3   \
	VERLLF $rot, u0, u0 \
	VERLLF $rot, u1, u1 \
	VERLLF $rot, u2, u2 \
	VERLLF $rot, u3, u3

// ROUND4 runs four quarter rounds side by side, one on each of the register
// groups (a0..a3), (b0..b3), (c0..c3) and (d0..d3). For a group (x0..x3)
// the steps are:
//
//	x0 += x1; x2 ^= x0; x2 <<<= 16
//	x3 += x2; x1 ^= x3; x1 <<<= 12
//	x0 += x1; x2 ^= x0; x2 <<<= 8
//	x3 += x2; x1 ^= x3; x1 <<<= 7
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	ADD_XOR_ROL4(16, a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2) \
	ADD_XOR_ROL4(12, a3, a2, a1, b3, b2, b1, c3, c2, c1, d3, d2, d1) \
	ADD_XOR_ROL4(8, a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2)  \
	ADD_XOR_ROL4(7, a3, a2, a1, b3, b2, b1, c3, c2, c1, d3, d2, d1)
107
// PERMUTE rearranges the bytes of each of the four vectors in place, using
// sel as the VPERM byte-index vector. Each vector is passed as both VPERM
// source operands.
#define PERMUTE(sel, p0, p1, p2, p3) \
	VPERM p0, p0, sel, p0 \
	VPERM p1, p1, sel, p1 \
	VPERM p2, p2, sel, p2 \
	VPERM p3, p3, sel, p3
113
// ADDV adds the vector addend to each of the four sum vectors in place,
// element by element on 32-bit words.
#define ADDV(addend, s0, s1, s2, s3) \
	VAF addend, s0, s0 \
	VAF addend, s1, s1 \
	VAF addend, s2, s2 \
	VAF addend, s3, s3
119
// XORV processes one 64-byte chunk: it loads 64 bytes from off(inp) into
// M0..M3, byte-reverses every 32-bit word of the keystream vectors k0..k3
// (presumably to get ChaCha's little-endian word serialization on this
// big-endian target), XORs keystream into the loaded data and stores the
// 64-byte result at off(outp).
//
// Clobbers M0..M3; k0..k3 are left in their byte-swapped form.
#define XORV(off, outp, inp, k0, k1, k2, k3) \
	VLM  off(inp), M0, M3          \
	PERMUTE(BSWAP, k0, k1, k2, k3) \
	VX   k0, M0, M0                \
	VX   k1, M1, M1                \
	VX   k2, M2, M2                \
	VX   k3, M3, M3                \
	VSTM M0, M3, off(outp)
128
// SHUFFLE transposes the 4x4 matrix of 32-bit elements held in a, b, c, d,
// using hi0, hi1, lo0, lo1 as scratch registers.
//
// First the rows are interleaved pairwise:
//
//	hi0 = {a[0], c[0], a[1], c[1]}
//	hi1 = {b[0], d[0], b[1], d[1]}
//	lo0 = {a[2], c[2], a[3], c[3]}
//	lo1 = {b[2], d[2], b[3], d[3]}
//
// and then the intermediates are interleaved again:
//
//	a = {a[0], b[0], c[0], d[0]}
//	b = {a[1], b[1], c[1], d[1]}
//	c = {a[2], b[2], c[2], d[2]}
//	d = {a[3], b[3], c[3], d[3]}
#define SHUFFLE(a, b, c, d, hi0, hi1, lo0, lo1) \
	VMRHF a, c, hi0   \
	VMRHF b, d, hi1   \
	VMRLF a, c, lo0   \
	VMRLF b, d, lo1   \
	VMRHF hi0, hi1, a \
	VMRLF hi0, hi1, b \
	VMRHF lo0, lo1, c \
	VMRLF lo0, lo1, d
138
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
// dst, 256 bytes (four 64-byte blocks) per iteration. *counter supplies the
// block counter of the first block and is updated on return to the counter
// of the block following the last one processed.
//
// NOTE(review): the remaining length is tested only after a chunk has been
// processed, and the loop exits only when it is exactly zero. Callers must
// therefore pass len(src) as a non-zero multiple of 256; dst is presumably
// required to be at least as long as src - confirm against the Go caller.
//
// General register usage:
//	R0  length operand for VLL
//	R1  address of ·constants<>, later the round loop counter
//	R2  dst pointer (advances by 256 per chunk)
//	R3  src pointer (advances by 256 per chunk)
//	R4  bytes remaining
//	R5  key pointer
//	R6  nonce pointer
//	R7  counter pointer
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup
	//
	// VLL takes the index of the highest byte to load. The nonce is 12
	// bytes long, so the index is 11; any value of 15 or more would load a
	// full 16 bytes and read 4 bytes past the end of *nonce. The bytes that
	// are not loaded are shifted out below, so NONCE is the same either way.
	MOVD  $11, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE
	VZERO M0
	VLEIB $7, $32, M0          // shift amount: 32 bits = 4 bytes
	VSRLB M0, NONCE, NONCE     // NONCE = {0, nonce[0], nonce[1], nonce[2]}

	// initialize counter values
	VLREPF (R7), CTR           // CTR = {c, c, c, c}
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC         // INC = {0, 1, 2, 3}
	VAF    INC, CTR, CTR       // CTR = {c, c+1, c+2, c+3}
	VREPIF $4, INC             // INC = {4, 4, 4, 4} for every later chunk

chacha:
	// Build the initial state for four blocks at once: vector Xi holds
	// state word i of all four blocks. Only X12 (the counter) differs
	// between the blocks.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	MOVD $(NUM_ROUNDS/2), R1

loop:
	// one column round followed by one diagonal round
	ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors
	//
	// Each SHUFFLE transposes four state-word vectors so that afterwards
	// one vector holds four consecutive words of a single block. ADDV then
	// adds the matching words of the initial state. The per-block counter
	// is added to X12 before the transpose, while it is still one block per
	// element; element 0 of NONCE is zero, so the final ADDV only adds the
	// nonce words.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext, one 64-byte block per XORV
	XORV(0*64, R2, R3, X0, X4, X8, X12)
	XORV(1*64, R2, R3, X1, X5, X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	CMPBNE R4, $0, chacha

	// store the counter of the next unprocessed block back to *counter
	VSTEF $0, CTR, (R7)
	RET
226
View as plain text