src/crypto/sha256/sha256block_amd64.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
7 // SHA256 block routine. See sha256block.go for Go equivalent.
8 //
9 // The algorithm is detailed in FIPS 180-4:
10 //
11 // https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
12
13 // The AVX2 version is described in an Intel white paper:
14 // "Fast SHA-256 Implementations on Intel Architecture Processors"
15 // To find it, go to http://www.intel.com/p/en_US/embedded
16 // and search for that title.
17 // AVX2 version by Intel, same algorithm as code in Linux kernel:
18 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
19 // by
20 // James Guilford <james.guilford@intel.com>
21 // Kirk Yap <kirk.s.yap@intel.com>
22 // Tim Chen <tim.c.chen@linux.intel.com>
23
24 // Wt = Mt; for 0 <= t <= 15
25 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
26 //
27 // a = H0
28 // b = H1
29 // c = H2
30 // d = H3
31 // e = H4
32 // f = H5
33 // g = H6
34 // h = H7
35 //
36 // for t = 0 to 63 {
37 // T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
38 // T2 = BIGSIGMA0(a) + Maj(a,b,c)
39 // h = g
40 // g = f
41 // f = e
42 // e = d + T1
43 // d = c
44 // c = b
45 // b = a
46 // a = T1 + T2
47 // }
48 //
49 // H0 = a + H0
50 // H1 = b + H1
51 // H2 = c + H2
52 // H3 = d + H3
53 // H4 = e + H4
54 // H5 = f + H5
55 // H6 = g + H6
56 // H7 = h + H7
57
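// For reference, the loop above corresponds roughly to the following Go
// (a minimal sketch in the spirit of sha256block.go; the helper names
// bigSigma0/bigSigma1, ch, maj and the constant table _K are illustrative
// here and are spelled out next to the macros below):
//
//	a, b, c, d, e, f, g, h := H0, H1, H2, H3, H4, H5, H6, H7
//	for t := 0; t < 64; t++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
//	H0, H1, H2, H3 = a+H0, b+H1, c+H2, d+H3
//	H4, H5, H6, H7 = e+H4, f+H5, g+H6, h+H7
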
58 // Wt = Mt; for 0 <= t <= 15
59 #define MSGSCHEDULE0(index) \
60 MOVL (index*4)(SI), AX; \
61 BSWAPL AX; \
62 MOVL AX, (index*4)(BP)
63
64 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
65 // SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
66 // SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
67 #define MSGSCHEDULE1(index) \
68 MOVL ((index-2)*4)(BP), AX; \
69 MOVL AX, CX; \
70 RORL $17, AX; \
71 MOVL CX, DX; \
72 RORL $19, CX; \
73 SHRL $10, DX; \
74 MOVL ((index-15)*4)(BP), BX; \
75 XORL CX, AX; \
76 MOVL BX, CX; \
77 XORL DX, AX; \
78 RORL $7, BX; \
79 MOVL CX, DX; \
80 SHRL $3, DX; \
81 RORL $18, CX; \
82 ADDL ((index-7)*4)(BP), AX; \
83 XORL CX, BX; \
84 XORL DX, BX; \
85 ADDL ((index-16)*4)(BP), BX; \
86 ADDL BX, AX; \
87 MOVL AX, ((index)*4)(BP)
88
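// A hedged Go rendering of this schedule step (assuming math/bits;
// bits.RotateLeft32 with a negative count rotates right):
//
//	func sigma0(x uint32) uint32 {
//		return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
//	}
//	func sigma1(x uint32) uint32 {
//		return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
//	}
//
//	// w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
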
89 // Calculate T1 in AX - uses AX, CX and DX registers.
90 // h is also used as an accumulator. Wt is passed in AX.
91 // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
92 // BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
93 // Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
94 #define SHA256T1(const, e, f, g, h) \
95 ADDL AX, h; \
96 MOVL e, AX; \
97 ADDL $const, h; \
98 MOVL e, CX; \
99 RORL $6, AX; \
100 MOVL e, DX; \
101 RORL $11, CX; \
102 XORL CX, AX; \
103 MOVL e, CX; \
104 RORL $25, DX; \
105 ANDL f, CX; \
106 XORL AX, DX; \
107 MOVL e, AX; \
108 NOTL AX; \
109 ADDL DX, h; \
110 ANDL g, AX; \
111 XORL CX, AX; \
112 ADDL h, AX
113
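// The same computation as a Go sketch (helper names are illustrative):
//
//	func bigSigma1(x uint32) uint32 {
//		return bits.RotateLeft32(x, -6) ^ bits.RotateLeft32(x, -11) ^ bits.RotateLeft32(x, -25)
//	}
//	func ch(x, y, z uint32) uint32 { return (x & y) ^ (^x & z) }
//
//	// t1 = h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
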
114 // Calculate T2 in BX - uses BX, CX, DX and DI registers.
115 // T2 = BIGSIGMA0(a) + Maj(a, b, c)
116 // BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
117 // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
118 #define SHA256T2(a, b, c) \
119 MOVL a, DI; \
120 MOVL c, BX; \
121 RORL $2, DI; \
122 MOVL a, DX; \
123 ANDL b, BX; \
124 RORL $13, DX; \
125 MOVL a, CX; \
126 ANDL c, CX; \
127 XORL DX, DI; \
128 XORL CX, BX; \
129 MOVL a, DX; \
130 MOVL b, CX; \
131 RORL $22, DX; \
132 ANDL a, CX; \
133 XORL CX, BX; \
134 XORL DX, DI; \
135 ADDL DI, BX
136
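// And the matching Go sketch for T2:
//
//	func bigSigma0(x uint32) uint32 {
//		return bits.RotateLeft32(x, -2) ^ bits.RotateLeft32(x, -13) ^ bits.RotateLeft32(x, -22)
//	}
//	func maj(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
//
//	// t2 = bigSigma0(a) + maj(a, b, c)
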
137 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
138 // The values for e and a are stored in d and h, ready for rotation.
139 #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
140 SHA256T1(const, e, f, g, h); \
141 SHA256T2(a, b, c); \
142 MOVL BX, h; \
143 ADDL AX, d; \
144 ADDL AX, h
145
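// Note that the h=g, g=f, ... rotation from the pseudocode above is not done
// with moves: each SHA256ROUND0/SHA256ROUND1 invocation below passes the
// eight registers rotated by one position, so a round only has to write d
// (the new e) and h (the new a).
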
146 #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
147 MSGSCHEDULE0(index); \
148 SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
149
150 #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
151 MSGSCHEDULE1(index); \
152 SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
153
154
155 // Definitions for AVX2 version
156
157 // addm(mem, reg)
158 // Add reg to mem, then load the sum back into reg
159 #define addm(P1, P2) \
160 ADDL P2, P1; \
161 MOVL P1, P2
162
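// Rough Go equivalent: mem += reg; reg = mem (the digest word in memory is
// updated and the register keeps the new value for the next block).
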
163 #define XDWORD0 Y4
164 #define XDWORD1 Y5
165 #define XDWORD2 Y6
166 #define XDWORD3 Y7
167
168 #define XWORD0 X4
169 #define XWORD1 X5
170 #define XWORD2 X6
171 #define XWORD3 X7
172
173 #define XTMP0 Y0
174 #define XTMP1 Y1
175 #define XTMP2 Y2
176 #define XTMP3 Y3
177 #define XTMP4 Y8
178 #define XTMP5 Y11
179
180 #define XFER Y9
181
182 #define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
183 #define X_BYTE_FLIP_MASK X13
184
185 #define NUM_BYTES DX
186 #define INP DI
187
188 #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
189
190 #define a AX
191 #define b BX
192 #define c CX
193 #define d R8
194 #define e DX
195 #define f R9
196 #define g R10
197 #define h R11
198
199 #define old_h R11
200
201 #define TBL BP
202
203 #define SRND SI // SRND is same register as CTX
204
205 #define T1 R12
206
207 #define y0 R13
208 #define y1 R14
209 #define y2 R15
210 #define y3 DI
211
212 // Offsets
213 #define XFER_SIZE 2*64*4
214 #define INP_END_SIZE 8
215 #define INP_SIZE 8
216
217 #define _XFER 0
218 #define _INP_END _XFER + XFER_SIZE
219 #define _INP _INP_END + INP_END_SIZE
220 #define STACK_SIZE _INP + INP_SIZE
221
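// With the sizes above, the offsets work out to (in bytes):
//
//	_XFER      =   0   // 2*64*4 = 512 bytes of saved w+k values (two interleaved blocks)
//	_INP_END   = 512   // 8-byte pointer to the last input block
//	_INP       = 520   // 8-byte pointer to the current input block
//	STACK_SIZE = 528
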
222 #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
223 ; \ // ############################# RND N + 0 ############################//
224 MOVL a, y3; \ // y3 = a // MAJA
225 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
226 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
227 ; \
228 ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
229 ORL c, y3; \ // y3 = a|c // MAJA
230 VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
231 MOVL f, y2; \ // y2 = f // CH
232 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
233 ; \
234 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
235 XORL g, y2; \ // y2 = f^g // CH
236 VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
237 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
238 ; \
239 ANDL e, y2; \ // y2 = (f^g)&e // CH
240 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
241 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
242 ADDL h, d; \ // d = k + w + h + d // --
243 ; \
244 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
245 VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
246 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
247 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
248 ; \
249 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
250 VPSRLD $7, XTMP1, XTMP2; \
251 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
252 MOVL a, T1; \ // T1 = a // MAJB
253 ANDL c, T1; \ // T1 = a&c // MAJB
254 ; \
255 ADDL y0, y2; \ // y2 = S1 + CH // --
256 VPSLLD $(32-7), XTMP1, XTMP3; \
257 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
258 ADDL y1, h; \ // h = k + w + h + S0 // --
259 ; \
260 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
261 VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
262 ; \
263 VPSRLD $18, XTMP1, XTMP2; \
264 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
265 ADDL y3, h // h = t1 + S0 + MAJ // --
266
267 #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
268 ; \ // ################################### RND N + 1 ############################
269 ; \
270 MOVL a, y3; \ // y3 = a // MAJA
271 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
272 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
273 ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
274 ORL c, y3; \ // y3 = a|c // MAJA
275 ; \
276 VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
277 MOVL f, y2; \ // y2 = f // CH
278 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
279 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
280 XORL g, y2; \ // y2 = f^g // CH
281 ; \
282 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
283 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
284 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
285 ANDL e, y2; \ // y2 = (f^g)&e // CH
286 ADDL h, d; \ // d = k + w + h + d // --
287 ; \
288 VPSLLD $(32-18), XTMP1, XTMP1; \
289 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
290 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
291 ; \
292 VPXOR XTMP1, XTMP3, XTMP3; \
293 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
294 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
295 ; \
296 VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
297 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
298 MOVL a, T1; \ // T1 = a // MAJB
299 ANDL c, T1; \ // T1 = a&c // MAJB
300 ADDL y0, y2; \ // y2 = S1 + CH // --
301 ; \
302 VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
303 VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
304 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
305 ADDL y1, h; \ // h = k + w + h + S0 // --
306 ; \
307 VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
308 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
309 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
310 ADDL y3, h; \ // h = t1 + S0 + MAJ // --
311 ; \
312 VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
313
314 #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
315 ; \ // ################################### RND N + 2 ############################
316 ; \
317 MOVL a, y3; \ // y3 = a // MAJA
318 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
319 ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
320 ; \
321 VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
322 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
323 ORL c, y3; \ // y3 = a|c // MAJA
324 MOVL f, y2; \ // y2 = f // CH
325 XORL g, y2; \ // y2 = f^g // CH
326 ; \
327 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
328 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
329 VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
330 ANDL e, y2; \ // y2 = (f^g)&e // CH
331 ; \
332 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
333 VPXOR XTMP3, XTMP2, XTMP2; \
334 ADDL h, d; \ // d = k + w + h + d // --
335 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
336 ; \
337 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
338 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
339 VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
340 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
341 ; \
342 VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
343 ; \
344 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
345 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
346 VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
347 ; \
348 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
349 MOVL a, T1; \ // T1 = a // MAJB
350 ANDL c, T1; \ // T1 = a&c // MAJB
351 ADDL y0, y2; \ // y2 = S1 + CH // --
352 VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
353 ; \
354 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
355 ADDL y1, h; \ // h = k + w + h + S0 // --
356 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
357 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
358 ; \
359 ADDL y3, h // h = t1 + S0 + MAJ // --
360
361 #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
362 ; \ // ################################### RND N + 3 ############################
363 ; \
364 MOVL a, y3; \ // y3 = a // MAJA
365 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
366 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
367 ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
368 ORL c, y3; \ // y3 = a|c // MAJA
369 ; \
370 VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
371 MOVL f, y2; \ // y2 = f // CH
372 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
373 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
374 XORL g, y2; \ // y2 = f^g // CH
375 ; \
376 VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
377 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
378 ANDL e, y2; \ // y2 = (f^g)&e // CH
379 ADDL h, d; \ // d = k + w + h + d // --
380 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
381 ; \
382 VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
383 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
384 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
385 ; \
386 VPXOR XTMP3, XTMP2, XTMP2; \
387 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
388 ADDL y0, y2; \ // y2 = S1 + CH // --
389 ; \
390 VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
391 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
392 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
393 ; \
394 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
395 ; \
396 VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
397 ; \
398 VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
399 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
400 MOVL a, T1; \ // T1 = a // MAJB
401 ANDL c, T1; \ // T1 = a&c // MAJB
402 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
403 ; \
404 ADDL y1, h; \ // h = k + w + h + S0 // --
405 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
406 ADDL y3, h // h = t1 + S0 + MAJ // --
407
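// Taken together, ROUND_AND_SCHED_N_0..3 interleave four rounds of the
// compression function with the vector computation of the next four schedule
// words. Writing the current 16-word window as w[0..15], the vector side of
// one group corresponds to this scalar Go sketch (sigma helpers as above):
//
//	for i := 0; i < 4; i++ {
//		w[16+i] = sigma1(w[14+i]) + w[9+i] + sigma0(w[1+i]) + w[i]
//	}
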
408 #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
409 ; \ // ################################### RND N + 0 ###########################
410 MOVL f, y2; \ // y2 = f // CH
411 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
412 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
413 XORL g, y2; \ // y2 = f^g // CH
414 ; \
415 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
416 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
417 ANDL e, y2; \ // y2 = (f^g)&e // CH
418 ; \
419 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
420 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
421 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
422 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
423 MOVL a, y3; \ // y3 = a // MAJA
424 ; \
425 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
426 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
427 ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
428 ORL c, y3; \ // y3 = a|c // MAJA
429 ; \
430 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
431 MOVL a, T1; \ // T1 = a // MAJB
432 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
433 ANDL c, T1; \ // T1 = a&c // MAJB
434 ADDL y0, y2; \ // y2 = S1 + CH // --
435 ; \
436 ADDL h, d; \ // d = k + w + h + d // --
437 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
438 ADDL y1, h; \ // h = k + w + h + S0 // --
439 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
440
441 #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
442 ; \ // ################################### RND N + 1 ###########################
443 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
444 MOVL f, y2; \ // y2 = f // CH
445 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
446 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
447 XORL g, y2; \ // y2 = f^g // CH
448 ; \
449 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
450 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
451 ANDL e, y2; \ // y2 = (f^g)&e // CH
452 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
453 ; \
454 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
455 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
456 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
457 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
458 MOVL a, y3; \ // y3 = a // MAJA
459 ; \
460 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
461 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
462 ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
463 ORL c, y3; \ // y3 = a|c // MAJA
464 ; \
465 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
466 MOVL a, T1; \ // T1 = a // MAJB
467 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
468 ANDL c, T1; \ // T1 = a&c // MAJB
469 ADDL y0, y2; \ // y2 = S1 + CH // --
470 ; \
471 ADDL h, d; \ // d = k + w + h + d // --
472 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
473 ADDL y1, h; \ // h = k + w + h + S0 // --
474 ; \
475 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
476
477 #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
478 ; \ // ################################### RND N + 2 ##############################
479 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
480 MOVL f, y2; \ // y2 = f // CH
481 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
482 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
483 XORL g, y2; \ // y2 = f^g // CH
484 ; \
485 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
486 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
487 ANDL e, y2; \ // y2 = (f^g)&e // CH
488 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
489 ; \
490 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
491 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
492 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
493 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
494 MOVL a, y3; \ // y3 = a // MAJA
495 ; \
496 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
497 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
498 ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
499 ORL c, y3; \ // y3 = a|c // MAJA
500 ; \
501 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
502 MOVL a, T1; \ // T1 = a // MAJB
503 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
504 ANDL c, T1; \ // T1 = a&c // MAJB
505 ADDL y0, y2; \ // y2 = S1 + CH // --
506 ; \
507 ADDL h, d; \ // d = k + w + h + d // --
508 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
509 ADDL y1, h; \ // h = k + w + h + S0 // --
510 ; \
511 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
512
513 #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
514 ; \ // ################################### RND N + 3 ###########################
515 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
516 MOVL f, y2; \ // y2 = f // CH
517 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
518 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
519 XORL g, y2; \ // y2 = f^g // CH
520 ; \
521 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
522 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
523 ANDL e, y2; \ // y2 = (f^g)&e // CH
524 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
525 ; \
526 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
527 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
528 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
529 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
530 MOVL a, y3; \ // y3 = a // MAJA
531 ; \
532 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
533 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
534 ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
535 ORL c, y3; \ // y3 = a|c // MAJA
536 ; \
537 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
538 MOVL a, T1; \ // T1 = a // MAJB
539 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
540 ANDL c, T1; \ // T1 = a&c // MAJB
541 ADDL y0, y2; \ // y2 = S1 + CH // --
542 ; \
543 ADDL h, d; \ // d = k + w + h + d // --
544 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
545 ADDL y1, h; \ // h = k + w + h + S0 // --
546 ; \
547 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
548 ; \
549 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
550 ; \
551 ADDL y3, h // h = t1 + S0 + MAJ // --
552
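// Unlike the ROUND_AND_SCHED macros, DO_ROUND_N_0 leaves the S1+CH and MAJ
// contributions to h pending: each of DO_ROUND_N_1..3 starts by folding y2
// and y3 into old_h (the previous round's h register), and DO_ROUND_N_3 also
// completes its own h at the end, so nothing is left pending across a group
// of four rounds.
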
553 TEXT ·block(SB), 0, $536-32
554 CMPB ·useAVX2(SB), $1
555 JE avx2
556
557 MOVQ p_base+8(FP), SI
558 MOVQ p_len+16(FP), DX
559 SHRQ $6, DX
560 SHLQ $6, DX
561
562 LEAQ (SI)(DX*1), DI
563 MOVQ DI, 256(SP)
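// The SHRQ/SHLQ pair above rounds p_len down to a whole number of 64-byte
// blocks. The first 256 bytes of the frame hold the message schedule
// W[0..63] (addressed through BP, which is pointed at SP inside the loop),
// and the end-of-input pointer lives at 256(SP).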
564 CMPQ SI, DI
565 JEQ end
566
567 MOVQ dig+0(FP), BP
568 MOVL (0*4)(BP), R8 // a = H0
569 MOVL (1*4)(BP), R9 // b = H1
570 MOVL (2*4)(BP), R10 // c = H2
571 MOVL (3*4)(BP), R11 // d = H3
572 MOVL (4*4)(BP), R12 // e = H4
573 MOVL (5*4)(BP), R13 // f = H5
574 MOVL (6*4)(BP), R14 // g = H6
575 MOVL (7*4)(BP), R15 // h = H7
576
577 loop:
578 MOVQ SP, BP
579
580 SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
581 SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
582 SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
583 SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
584 SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
585 SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
586 SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
587 SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
588 SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
589 SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
590 SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
591 SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
592 SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
593 SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
594 SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
595 SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
596
597 SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
598 SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
599 SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
600 SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
601 SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
602 SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
603 SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
604 SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
605 SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
606 SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
607 SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
608 SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
609 SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
610 SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
611 SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
612 SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
613 SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
614 SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
615 SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
616 SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
617 SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
618 SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
619 SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
620 SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
621 SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
622 SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
623 SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
624 SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
625 SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
626 SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
627 SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
628 SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
629 SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
630 SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
631 SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
632 SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
633 SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
634 SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
635 SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
636 SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
637 SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
638 SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
639 SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
640 SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
641 SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
642 SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
643 SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
644 SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
645
646 MOVQ dig+0(FP), BP
647 ADDL (0*4)(BP), R8 // H0 = a + H0
648 MOVL R8, (0*4)(BP)
649 ADDL (1*4)(BP), R9 // H1 = b + H1
650 MOVL R9, (1*4)(BP)
651 ADDL (2*4)(BP), R10 // H2 = c + H2
652 MOVL R10, (2*4)(BP)
653 ADDL (3*4)(BP), R11 // H3 = d + H3
654 MOVL R11, (3*4)(BP)
655 ADDL (4*4)(BP), R12 // H4 = e + H4
656 MOVL R12, (4*4)(BP)
657 ADDL (5*4)(BP), R13 // H5 = f + H5
658 MOVL R13, (5*4)(BP)
659 ADDL (6*4)(BP), R14 // H6 = g + H6
660 MOVL R14, (6*4)(BP)
661 ADDL (7*4)(BP), R15 // H7 = h + H7
662 MOVL R15, (7*4)(BP)
663
664 ADDQ $64, SI
665 CMPQ SI, 256(SP)
666 JB loop
667
668 end:
669 RET
670
671 avx2:
672 MOVQ dig+0(FP), CTX // d.h[8]
673 MOVQ p_base+8(FP), INP
674 MOVQ p_len+16(FP), NUM_BYTES
675
676 LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
677 MOVQ NUM_BYTES, _INP_END(SP)
678
679 CMPQ NUM_BYTES, INP
680 JE avx2_only_one_block
681
682 // Load initial digest
683 MOVL 0(CTX), a // a = H0
684 MOVL 4(CTX), b // b = H1
685 MOVL 8(CTX), c // c = H2
686 MOVL 12(CTX), d // d = H3
687 MOVL 16(CTX), e // e = H4
688 MOVL 20(CTX), f // f = H5
689 MOVL 24(CTX), g // g = H6
690 MOVL 28(CTX), h // h = H7
691
692 avx2_loop0: // each pass loads two 64-byte blocks (2 x 512 bits); the second block is processed in avx2_loop3 using the w+k values saved during the first
693
694 VMOVDQU (0*32)(INP), XTMP0
695 VMOVDQU (1*32)(INP), XTMP1
696 VMOVDQU (2*32)(INP), XTMP2
697 VMOVDQU (3*32)(INP), XTMP3
698
699 VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
700
701 // Apply Byte Flip Mask: LE -> BE
702 VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
703 VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
704 VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
705 VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
706
707 // Transpose data into high/low parts
708 VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
709 VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
710 VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
711 VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
712
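// After the transpose, the low 128 bits of XDWORD0..XDWORD3 hold w0..w15 of
// the first block and the high 128 bits hold w0..w15 of the second block, so
// the vector schedule below expands both blocks at once; the second block's
// w+k values are consumed later by avx2_loop3 at offset +16 within each
// 32-byte _XFER slot.
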
713 MOVQ $K256<>(SB), TBL // Load the address of the table of round-specific constants
714
715 avx2_last_block_enter:
716 ADDQ $64, INP
717 MOVQ INP, _INP(SP)
718 XORQ SRND, SRND
719
720 avx2_loop1: // for w0 - w47
721 // Do 4 rounds and scheduling
722 VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
723 VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
724 ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
725 ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
726 ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
727 ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
728
729 // Do 4 rounds and scheduling
730 VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
731 VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
732 ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
733 ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
734 ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
735 ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
736
737 // Do 4 rounds and scheduling
738 VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
739 VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
740 ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
741 ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
742 ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
743 ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
744
745 // Do 4 rounds and scheduling
746 VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
747 VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
748 ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
749 ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
750 ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
751 ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
752
753 ADDQ $4*32, SRND
754 CMPQ SRND, $3*4*32
755 JB avx2_loop1
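// SRND advances by 4*32 bytes of saved w+k per pass, so three passes of
// avx2_loop1 cover rounds 0-47 before falling through to avx2_loop2.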
756
757 avx2_loop2:
758 // w48 - w63 processed with no scheduling (last 16 rounds)
759 VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
760 VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
761 DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
762 DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
763 DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
764 DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
765
766 VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
767 VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
768 DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
769 DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
770 DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
771 DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
772
773 ADDQ $2*32, SRND
774
775 VMOVDQU XDWORD2, XDWORD0
776 VMOVDQU XDWORD3, XDWORD1
777
778 CMPQ SRND, $4*4*32
779 JB avx2_loop2
780
781 MOVQ dig+0(FP), CTX // d.h[8]
782 MOVQ _INP(SP), INP
783
784 addm( 0(CTX), a)
785 addm( 4(CTX), b)
786 addm( 8(CTX), c)
787 addm( 12(CTX), d)
788 addm( 16(CTX), e)
789 addm( 20(CTX), f)
790 addm( 24(CTX), g)
791 addm( 28(CTX), h)
792
793 CMPQ _INP_END(SP), INP
794 JB done_hash
795
796 XORQ SRND, SRND
797
798 avx2_loop3: // Do second block using previously scheduled results
799 DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
800 DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
801 DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
802 DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
803
804 DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
805 DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
806 DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
807 DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
808
809 ADDQ $2*32, SRND
810 CMPQ SRND, $4*4*32
811 JB avx2_loop3
812
813 MOVQ dig+0(FP), CTX // d.h[8]
814 MOVQ _INP(SP), INP
815 ADDQ $64, INP
816
817 addm( 0(CTX), a)
818 addm( 4(CTX), b)
819 addm( 8(CTX), c)
820 addm( 12(CTX), d)
821 addm( 16(CTX), e)
822 addm( 20(CTX), f)
823 addm( 24(CTX), g)
824 addm( 28(CTX), h)
825
826 CMPQ _INP_END(SP), INP
827 JA avx2_loop0
828 JB done_hash
829
830 avx2_do_last_block:
831
832 VMOVDQU 0(INP), XWORD0
833 VMOVDQU 16(INP), XWORD1
834 VMOVDQU 32(INP), XWORD2
835 VMOVDQU 48(INP), XWORD3
836
837 VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
838
839 VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
840 VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
841 VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
842 VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
843
844 MOVQ $K256<>(SB), TBL
845
846 JMP avx2_last_block_enter
847
848 avx2_only_one_block:
849 // Load initial digest
850 MOVL 0(CTX), a // a = H0
851 MOVL 4(CTX), b // b = H1
852 MOVL 8(CTX), c // c = H2
853 MOVL 12(CTX), d // d = H3
854 MOVL 16(CTX), e // e = H4
855 MOVL 20(CTX), f // f = H5
856 MOVL 24(CTX), g // g = H6
857 MOVL 28(CTX), h // h = H7
858
859 JMP avx2_do_last_block
860
861 done_hash:
862 VZEROUPPER
863 RET
864
865 // shuffle byte order from LE to BE
866 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
867 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
868 DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
869 DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
870 GLOBL flip_mask<>(SB), 8, $32
871
872 // shuffle xBxA -> 00BA
873 DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
874 DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
875 DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
876 DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
877 GLOBL shuff_00BA<>(SB), 8, $32
878
879 // shuffle xDxC -> DC00
880 DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
881 DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
882 DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
883 DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
884 GLOBL shuff_DC00<>(SB), 8, $32
885
886 // Round specific constants
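// Each group of four constants is stored twice, once per 128-bit lane, so a
// single 256-bit VPADDD adds the same k values to the schedule words of both
// interleaved blocks.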
887 DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
888 DATA K256<>+0x04(SB)/4, $0x71374491 // k2
889 DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
890 DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
891 DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
892 DATA K256<>+0x14(SB)/4, $0x71374491 // k2
893 DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
894 DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
895
896 DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
897 DATA K256<>+0x24(SB)/4, $0x59f111f1
898 DATA K256<>+0x28(SB)/4, $0x923f82a4
899 DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
900 DATA K256<>+0x30(SB)/4, $0x3956c25b
901 DATA K256<>+0x34(SB)/4, $0x59f111f1
902 DATA K256<>+0x38(SB)/4, $0x923f82a4
903 DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
904
905 DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
906 DATA K256<>+0x44(SB)/4, $0x12835b01
907 DATA K256<>+0x48(SB)/4, $0x243185be
908 DATA K256<>+0x4c(SB)/4, $0x550c7dc3
909 DATA K256<>+0x50(SB)/4, $0xd807aa98
910 DATA K256<>+0x54(SB)/4, $0x12835b01
911 DATA K256<>+0x58(SB)/4, $0x243185be
912 DATA K256<>+0x5c(SB)/4, $0x550c7dc3
913
914 DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
915 DATA K256<>+0x64(SB)/4, $0x80deb1fe
916 DATA K256<>+0x68(SB)/4, $0x9bdc06a7
917 DATA K256<>+0x6c(SB)/4, $0xc19bf174
918 DATA K256<>+0x70(SB)/4, $0x72be5d74
919 DATA K256<>+0x74(SB)/4, $0x80deb1fe
920 DATA K256<>+0x78(SB)/4, $0x9bdc06a7
921 DATA K256<>+0x7c(SB)/4, $0xc19bf174
922
923 DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
924 DATA K256<>+0x84(SB)/4, $0xefbe4786
925 DATA K256<>+0x88(SB)/4, $0x0fc19dc6
926 DATA K256<>+0x8c(SB)/4, $0x240ca1cc
927 DATA K256<>+0x90(SB)/4, $0xe49b69c1
928 DATA K256<>+0x94(SB)/4, $0xefbe4786
929 DATA K256<>+0x98(SB)/4, $0x0fc19dc6
930 DATA K256<>+0x9c(SB)/4, $0x240ca1cc
931
932 DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
933 DATA K256<>+0xa4(SB)/4, $0x4a7484aa
934 DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
935 DATA K256<>+0xac(SB)/4, $0x76f988da
936 DATA K256<>+0xb0(SB)/4, $0x2de92c6f
937 DATA K256<>+0xb4(SB)/4, $0x4a7484aa
938 DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
939 DATA K256<>+0xbc(SB)/4, $0x76f988da
940
941 DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
942 DATA K256<>+0xc4(SB)/4, $0xa831c66d
943 DATA K256<>+0xc8(SB)/4, $0xb00327c8
944 DATA K256<>+0xcc(SB)/4, $0xbf597fc7
945 DATA K256<>+0xd0(SB)/4, $0x983e5152
946 DATA K256<>+0xd4(SB)/4, $0xa831c66d
947 DATA K256<>+0xd8(SB)/4, $0xb00327c8
948 DATA K256<>+0xdc(SB)/4, $0xbf597fc7
949
950 DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
951 DATA K256<>+0xe4(SB)/4, $0xd5a79147
952 DATA K256<>+0xe8(SB)/4, $0x06ca6351
953 DATA K256<>+0xec(SB)/4, $0x14292967
954 DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
955 DATA K256<>+0xf4(SB)/4, $0xd5a79147
956 DATA K256<>+0xf8(SB)/4, $0x06ca6351
957 DATA K256<>+0xfc(SB)/4, $0x14292967
958
959 DATA K256<>+0x100(SB)/4, $0x27b70a85
960 DATA K256<>+0x104(SB)/4, $0x2e1b2138
961 DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
962 DATA K256<>+0x10c(SB)/4, $0x53380d13
963 DATA K256<>+0x110(SB)/4, $0x27b70a85
964 DATA K256<>+0x114(SB)/4, $0x2e1b2138
965 DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
966 DATA K256<>+0x11c(SB)/4, $0x53380d13
967
968 DATA K256<>+0x120(SB)/4, $0x650a7354
969 DATA K256<>+0x124(SB)/4, $0x766a0abb
970 DATA K256<>+0x128(SB)/4, $0x81c2c92e
971 DATA K256<>+0x12c(SB)/4, $0x92722c85
972 DATA K256<>+0x130(SB)/4, $0x650a7354
973 DATA K256<>+0x134(SB)/4, $0x766a0abb
974 DATA K256<>+0x138(SB)/4, $0x81c2c92e
975 DATA K256<>+0x13c(SB)/4, $0x92722c85
976
977 DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
978 DATA K256<>+0x144(SB)/4, $0xa81a664b
979 DATA K256<>+0x148(SB)/4, $0xc24b8b70
980 DATA K256<>+0x14c(SB)/4, $0xc76c51a3
981 DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
982 DATA K256<>+0x154(SB)/4, $0xa81a664b
983 DATA K256<>+0x158(SB)/4, $0xc24b8b70
984 DATA K256<>+0x15c(SB)/4, $0xc76c51a3
985
986 DATA K256<>+0x160(SB)/4, $0xd192e819
987 DATA K256<>+0x164(SB)/4, $0xd6990624
988 DATA K256<>+0x168(SB)/4, $0xf40e3585
989 DATA K256<>+0x16c(SB)/4, $0x106aa070
990 DATA K256<>+0x170(SB)/4, $0xd192e819
991 DATA K256<>+0x174(SB)/4, $0xd6990624
992 DATA K256<>+0x178(SB)/4, $0xf40e3585
993 DATA K256<>+0x17c(SB)/4, $0x106aa070
994
995 DATA K256<>+0x180(SB)/4, $0x19a4c116
996 DATA K256<>+0x184(SB)/4, $0x1e376c08
997 DATA K256<>+0x188(SB)/4, $0x2748774c
998 DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
999 DATA K256<>+0x190(SB)/4, $0x19a4c116
1000 DATA K256<>+0x194(SB)/4, $0x1e376c08
1001 DATA K256<>+0x198(SB)/4, $0x2748774c
1002 DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
1003
1004 DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
1005 DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
1006 DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
1007 DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
1008 DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
1009 DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
1010 DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
1011 DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
1012
1013 DATA K256<>+0x1c0(SB)/4, $0x748f82ee
1014 DATA K256<>+0x1c4(SB)/4, $0x78a5636f
1015 DATA K256<>+0x1c8(SB)/4, $0x84c87814
1016 DATA K256<>+0x1cc(SB)/4, $0x8cc70208
1017 DATA K256<>+0x1d0(SB)/4, $0x748f82ee
1018 DATA K256<>+0x1d4(SB)/4, $0x78a5636f
1019 DATA K256<>+0x1d8(SB)/4, $0x84c87814
1020 DATA K256<>+0x1dc(SB)/4, $0x8cc70208
1021
1022 DATA K256<>+0x1e0(SB)/4, $0x90befffa
1023 DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
1024 DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
1025 DATA K256<>+0x1ec(SB)/4, $0xc67178f2
1026 DATA K256<>+0x1f0(SB)/4, $0x90befffa
1027 DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
1028 DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
1029 DATA K256<>+0x1fc(SB)/4, $0xc67178f2
1030
1031 GLOBL K256<>(SB), (NOPTR + RODATA), $512
1032