Text file
src/crypto/sha512/sha512block_amd64.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
7 // SHA512 block routine. See sha512block.go for Go equivalent.
8 //
9 // The algorithm is detailed in FIPS 180-4:
10 //
11 // https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
12 //
13 // Wt = Mt; for 0 <= t <= 15
14 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
15 //
16 // a = H0
17 // b = H1
18 // c = H2
19 // d = H3
20 // e = H4
21 // f = H5
22 // g = H6
23 // h = H7
24 //
25 // for t = 0 to 79 {
26 // T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
27 // T2 = BIGSIGMA0(a) + Maj(a,b,c)
28 // h = g
29 // g = f
30 // f = e
31 // e = d + T1
32 // d = c
33 // c = b
34 // b = a
35 // a = T1 + T2
36 // }
37 //
38 // H0 = a + H0
39 // H1 = b + H1
40 // H2 = c + H2
41 // H3 = d + H3
42 // H4 = e + H4
43 // H5 = f + H5
44 // H6 = g + H6
45 // H7 = h + H7
46
47 // Wt = Mt; for 0 <= t <= 15
// MSGSCHEDULE0 loads the 64-bit message word number `index` from the input
// block at SI, reverses its byte order with BSWAPQ, and stores it into slot
// `index` of the message schedule at BP. The swapped word is also left in AX,
// which is where SHA512T1 expects Wt.
// Clobbers: AX.
48 #define MSGSCHEDULE0(index) \
49 MOVQ (index*8)(SI), AX; \
50 BSWAPQ AX; \
51 MOVQ AX, (index*8)(BP)
52
53 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
54 // SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
55 // SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
// MSGSCHEDULE1 reads schedule slots index-2, index-7, index-15 and index-16
// from the schedule at BP, computes the new word, stores it into slot `index`
// and leaves it in AX (where SHA512T1 expects Wt).
// SIGMA1(Wt-2) + Wt-7 is accumulated in AX, SIGMA0(Wt-15) + Wt-16 in BX;
// the two halves are interleaved and summed at the end.
// Clobbers: AX, BX, CX, DX.
56 #define MSGSCHEDULE1(index) \
57 MOVQ ((index-2)*8)(BP), AX; \
58 MOVQ AX, CX; \
59 RORQ $19, AX; \
60 MOVQ CX, DX; \
61 RORQ $61, CX; \
62 SHRQ $6, DX; \
63 MOVQ ((index-15)*8)(BP), BX; \
64 XORQ CX, AX; \
65 MOVQ BX, CX; \
66 XORQ DX, AX; \
67 RORQ $1, BX; \
68 MOVQ CX, DX; \
69 SHRQ $7, DX; \
70 RORQ $8, CX; \
71 ADDQ ((index-7)*8)(BP), AX; \
72 XORQ CX, BX; \
73 XORQ DX, BX; \
74 ADDQ ((index-16)*8)(BP), BX; \
75 ADDQ BX, AX; \
76 MOVQ AX, ((index)*8)(BP)
77
78 // Calculate T1 in AX - uses AX, CX and DX registers.
79 // h is also used as an accumulator. Wt is passed in AX.
80 // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
81 // BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
82 // Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
// Parameters: const is the round constant Kt (materialised in DX as a 64-bit
// immediate); e, f and g are only read; h is modified in place and ends up
// holding h + Wt + Kt + BIGSIGMA1(e).
// The Ch term is built in AX as (e AND f) XOR (NOT e AND g) and the
// accumulated h is added to it last, so the final T1 is returned in AX.
83 #define SHA512T1(const, e, f, g, h) \
84 MOVQ $const, DX; \
85 ADDQ AX, h; \
86 MOVQ e, AX; \
87 ADDQ DX, h; \
88 MOVQ e, CX; \
89 RORQ $14, AX; \
90 MOVQ e, DX; \
91 RORQ $18, CX; \
92 XORQ CX, AX; \
93 MOVQ e, CX; \
94 RORQ $41, DX; \
95 ANDQ f, CX; \
96 XORQ AX, DX; \
97 MOVQ e, AX; \
98 NOTQ AX; \
99 ADDQ DX, h; \
100 ANDQ g, AX; \
101 XORQ CX, AX; \
102 ADDQ h, AX
103
104 // Calculate T2 in BX - uses BX, CX, DX and DI registers.
105 // T2 = BIGSIGMA0(a) + Maj(a, b, c)
106 // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
107 // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
// Parameters a, b and c are only read. BIGSIGMA0(a) is accumulated in DI and
// Maj(a, b, c) in BX; the two are summed into BX by the final ADDQ.
// AX is not touched, so the T1 value left there by SHA512T1 survives.
108 #define SHA512T2(a, b, c) \
109 MOVQ a, DI; \
110 MOVQ c, BX; \
111 RORQ $28, DI; \
112 MOVQ a, DX; \
113 ANDQ b, BX; \
114 RORQ $34, DX; \
115 MOVQ a, CX; \
116 ANDQ c, CX; \
117 XORQ DX, DI; \
118 XORQ CX, BX; \
119 MOVQ a, DX; \
120 MOVQ b, CX; \
121 RORQ $39, DX; \
122 ANDQ a, CX; \
123 XORQ CX, BX; \
124 XORQ DX, DI; \
125 ADDQ DI, BX
126
127 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
128 // The values for e and a are stored in d and h, ready for rotation.
// Expects Wt in AX on entry (left there by MSGSCHEDULE0/MSGSCHEDULE1).
// After SHA512T1 the T1 value is in AX; after SHA512T2 the T2 value is in BX.
// h is then overwritten with T2 and T1 is added to both d and h.
// The `index` parameter is not referenced by this macro body.
// Clobbers: AX, BX, CX, DX, DI (via SHA512T1 and SHA512T2).
129 #define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
130 SHA512T1(const, e, f, g, h); \
131 SHA512T2(a, b, c); \
132 MOVQ BX, h; \
133 ADDQ AX, d; \
134 ADDQ AX, h
135
// SHA512ROUND0 is one round for 0 <= t <= 15: it loads and byte-swaps message
// word `index` into the schedule (MSGSCHEDULE0), then runs SHA512ROUND on it.
136 #define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
137 MSGSCHEDULE0(index); \
138 SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
139
// SHA512ROUND1 is one round for 16 <= t <= 79: it derives schedule word
// `index` from earlier schedule words (MSGSCHEDULE1), then runs SHA512ROUND.
140 #define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
141 MSGSCHEDULE1(index); \
142 SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
143
// blockAMD64 hashes every complete 128-byte block of the input and updates
// the eight 64-bit hash words that dig points to.
//
// Arguments (32 bytes): dig+0(FP), p_base+8(FP), p_len+16(FP) - presumably
// func blockAMD64(dig *digest, p []byte); confirm against the Go declaration.
// Frame (648 bytes): 0..639(SP) is the 80-entry message schedule,
// 640(SP) holds the end-of-input pointer.
// Registers: SI = current block, BP = schedule base inside the loop,
// R8..R15 = working variables a..h, AX/BX/CX/DX/DI = macro scratch.
144 TEXT ·blockAMD64(SB),0,$648-32
145 MOVQ p_base+8(FP), SI
146 MOVQ p_len+16(FP), DX
147 SHRQ $7, DX // round the length down to a multiple of 128 bytes
148 SHLQ $7, DX
149
150 LEAQ (SI)(DX*1), DI // DI = end of the last complete block
151 MOVQ DI, 640(SP)
152 CMPQ SI, DI
153 JEQ end // no complete block: nothing to do
154
155 MOVQ dig+0(FP), BP
156 MOVQ (0*8)(BP), R8 // a = H0
157 MOVQ (1*8)(BP), R9 // b = H1
158 MOVQ (2*8)(BP), R10 // c = H2
159 MOVQ (3*8)(BP), R11 // d = H3
160 MOVQ (4*8)(BP), R12 // e = H4
161 MOVQ (5*8)(BP), R13 // f = H5
162 MOVQ (6*8)(BP), R14 // g = H6
163 MOVQ (7*8)(BP), R15 // h = H7
164
165 loop:
166 MOVQ SP, BP // message schedule
167
// The register arguments rotate by one position per round instead of moving
// values between registers. The pattern repeats every 8 rounds, so after
// round 79 R8..R15 hold a..h in their original order again.
168 SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
169 SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
170 SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
171 SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
172 SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
173 SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
174 SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
175 SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
176 SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
177 SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
178 SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
179 SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
180 SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
181 SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
182 SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
183 SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
184
185 SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
186 SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
187 SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
188 SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
189 SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
190 SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
191 SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
192 SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
193 SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
194 SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
195 SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
196 SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
197 SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
198 SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
199 SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
200 SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
201 SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
202 SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
203 SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
204 SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
205 SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
206 SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
207 SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
208 SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
209 SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
210 SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
211 SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
212 SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
213 SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
214 SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
215 SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
216 SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
217 SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
218 SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
219 SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
220 SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
221 SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
222 SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
223 SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
224 SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
225 SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
226 SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
227 SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
228 SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
229 SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
230 SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
231 SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
232 SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
233 SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
234 SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
235 SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
236 SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
237 SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
238 SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
239 SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
240 SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
241 SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
242 SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
243 SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
244 SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
245 SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
246 SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
247 SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
248 SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
249
250 MOVQ dig+0(FP), BP // BP was the schedule base during the rounds; reload dig
251 ADDQ (0*8)(BP), R8 // H0 = a + H0
252 MOVQ R8, (0*8)(BP)
253 ADDQ (1*8)(BP), R9 // H1 = b + H1
254 MOVQ R9, (1*8)(BP)
255 ADDQ (2*8)(BP), R10 // H2 = c + H2
256 MOVQ R10, (2*8)(BP)
257 ADDQ (3*8)(BP), R11 // H3 = d + H3
258 MOVQ R11, (3*8)(BP)
259 ADDQ (4*8)(BP), R12 // H4 = e + H4
260 MOVQ R12, (4*8)(BP)
261 ADDQ (5*8)(BP), R13 // H5 = f + H5
262 MOVQ R13, (5*8)(BP)
263 ADDQ (6*8)(BP), R14 // H6 = g + H6
264 MOVQ R14, (6*8)(BP)
265 ADDQ (7*8)(BP), R15 // H7 = h + H7
266 MOVQ R15, (7*8)(BP)
267
268 ADDQ $128, SI // advance to the next 128-byte block
269 CMPQ SI, 640(SP)
270 JB loop // continue while SI is below the saved end pointer
271
272 end:
273 RET
274
275 // Version below is based on "Fast SHA512 Implementations on Intel
276 // Architecture Processors" White-paper
277 // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
278 // AVX2 version by Intel, same algorithm in Linux kernel:
279 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
280
281 // James Guilford <james.guilford@intel.com>
282 // Kirk Yap <kirk.s.yap@intel.com>
283 // Tim Chen <tim.c.chen@linux.intel.com>
284 // David Cote <david.m.cote@intel.com>
285 // Aleksey Sidorov <aleksey.sidorov@intel.com>
286
// Stack frame layout used by blockAVX2 (sizes in bytes, offsets from SP):
//   frame_YFER   =  0, 32 bytes: K+W values for four rounds, spilled from a YMM register
//   frame_SRND   = 32,  8 bytes: loop counter for loop1 / loop2
//   frame_INP    = 40,  8 bytes: saved input pointer (DI is reused as scratch in the rounds)
//   frame_INPEND = 48,  8 bytes: pointer to the end of the last complete block
// The total of 56 bytes matches the $56 frame size declared on blockAVX2.
287 #define YFER_SIZE (4*8)
288 #define SRND_SIZE (1*8)
289 #define INP_SIZE (1*8)
290
291 #define frame_YFER (0)
292 #define frame_SRND (frame_YFER + YFER_SIZE)
293 #define frame_INP (frame_SRND + SRND_SIZE)
294 #define frame_INPEND (frame_INP + INP_SIZE)
295
// addm adds p1 into p2 and then stores the sum back to p1, so both end up
// holding p1 + p2. In this file p1 is a digest word in memory and p2 is the
// register holding the matching working variable.
296 #define addm(p1, p2) \
297 ADDQ p1, p2; \
298 MOVQ p2, p1
299
// COPY_YMM_AND_BSWAP loads 32 bytes from p2 into YMM register p1 with an
// unaligned load, then permutes the bytes of p1 in place with VPSHUFB using
// the shuffle mask in p3. In this file the mask is PSHUFFLE_BYTE_FLIP_MASK,
// which reverses the byte order of each 64-bit word.
300 #define COPY_YMM_AND_BSWAP(p1, p2, p3) \
301 VMOVDQU p2, p1; \
302 VPSHUFB p3, p1, p1
303
// MY_VPALIGNR emulates a byte-wise align/shift across two full 256-bit
// registers, which the per-128-bit-lane VPALIGNR cannot do on its own.
// Per the Intel/Linux reference implementation this code follows:
//   VPERM2F128: YDST = {low half of YSRC1, high half of YSRC2}
//   VPALIGNR:   YDST = {YDST, YSRC2} >> RVAL*8 bits, per 128-bit lane
// i.e. YDST receives 32 bytes of YSRC1:YSRC2 starting RVAL bytes into YSRC2.
// NOTE(review): lane description taken from the reference sources - confirm
// against the instruction-set reference before relying on it.
304 #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
305 VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
306 VPALIGNR $RVAL, YSRC2, YDST, YDST
307
// PSHUFFLE_BYTE_FLIP_MASK is the 32-byte VPSHUFB control mask used when
// loading message words: within each 8-byte group the byte indices run from
// highest to lowest, so the shuffle reverses the byte order of every 64-bit
// word. Read-only data containing no pointers (NOPTR+RODATA).
308 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
309 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
310 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
311 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
312
313 GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
314
// MASK_YMM_LO is a 32-byte AND mask whose low 16 bytes are zero and whose
// high 16 bytes are all ones. VPAND with it clears the low 128 bits of a YMM
// register and keeps the high 128 bits.
// Read-only data containing no pointers (NOPTR+RODATA).
315 DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
316 DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
317 DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
318 DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
319
320 GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
321
// blockAVX2 is the vectorised SHA-512 block routine. It hashes every complete
// 128-byte block of the input and updates the eight 64-bit hash words that
// dig points to.
//
// Arguments (32 bytes): dig+0(FP), p_base+8(FP), p_len+16(FP) - presumably
// the same Go signature as blockAMD64; confirm against the Go declaration.
// Frame (56 bytes): see the frame_* definitions.
// Instructions used include 256-bit VEX vector operations and RORXQ, so the
// caller presumably checks CPU feature support before dispatching here - TODO confirm.
//
// Register roles on entry to loop0:
//   SI = dig, DI = current input block, Y9 = byte-flip shuffle mask,
//   AX, BX, CX, R8, DX, R9, R10, R11 = working variables a, b, c, d, e, f, g, h.
322 TEXT ·blockAVX2(SB), NOSPLIT, $56-32
323 MOVQ dig+0(FP), SI
324 MOVQ p_base+8(FP), DI
325 MOVQ p_len+16(FP), DX
326
327 SHRQ $7, DX // round the length down to a multiple of 128 bytes
328 SHLQ $7, DX
329
330 JZ done_hash // no complete block: nothing to do
331 ADDQ DI, DX // DX = end of the last complete block
332 MOVQ DX, frame_INPEND(SP)
333
334 MOVQ (0*8)(SI), AX // a = H0
335 MOVQ (1*8)(SI), BX // b = H1
336 MOVQ (2*8)(SI), CX // c = H2
337 MOVQ (3*8)(SI), R8 // d = H3
338 MOVQ (4*8)(SI), DX // e = H4 (DX no longer holds the end pointer)
339 MOVQ (5*8)(SI), R9 // f = H5
340 MOVQ (6*8)(SI), R10 // g = H6
341 MOVQ (7*8)(SI), R11 // h = H7
342
343 VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
344
// loop0: one iteration per 128-byte input block.
345 loop0:
346 MOVQ ·_K+0(SB), BP // BP = first word stored at ·_K; presumably the base pointer of the K constant table - confirm in the Go source
347
348 // load the 16 message qwords (128 bytes) and byte swap each 64-bit word
349 COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
350 COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
351 COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
352 COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
353
354 MOVQ DI, frame_INP(SP) // DI is reused as scratch by the rounds below
355
356 // run 64 rounds while scheduling 64 new message qwords: 4 passes of loop1, 16 rounds each
357 MOVQ $4, frame_SRND(SP)
358
// loop1: each pass runs 16 scalar rounds and, interleaved with them, computes
// the next 16 schedule words in Y4..Y7 (four words per group of four rounds).
// Lane annotations below (W[t-7], W[t-15], ...) follow the Intel/Linux
// reference implementation cited above.
359 loop1:
360 VPADDQ (BP), Y4, Y0 // Y0 = K + W for the next four rounds
361 VMOVDQU Y0, frame_YFER(SP) // spill K+W so the scalar rounds can read it from the stack
362
363 MY_VPALIGNR(Y0, Y7, Y6, 8) // Y0 = W[t-7]
364
365 VPADDQ Y4, Y0, Y0 // Y0 = W[t-7] + W[t-16]
366
367 MY_VPALIGNR(Y1, Y5, Y4, 8) // Y1 = W[t-15]
368
369 VPSRLQ $1, Y1, Y2
370 VPSLLQ $(64-1), Y1, Y3
371 VPOR Y2, Y3, Y3 // Y3 = ROTR(1, W[t-15])
372
373 VPSRLQ $7, Y1, Y8 // Y8 = SHR(7, W[t-15])
374
375 MOVQ AX, DI // round 0 of 16: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11
376 RORXQ $41, DX, R13
377 RORXQ $18, DX, R14
378 ADDQ frame_YFER(SP), R11 // h += K + W
379 ORQ CX, DI
380 MOVQ R9, R15
381 RORXQ $34, AX, R12
382
383 XORQ R14, R13
384 XORQ R10, R15
385 RORXQ $14, DX, R14
386
387 ANDQ DX, R15
388 XORQ R14, R13 // R13 = BIGSIGMA1(e)
389 RORXQ $39, AX, R14
390 ADDQ R11, R8 // d += h + K + W
391
392 ANDQ BX, DI
393 XORQ R12, R14
394 RORXQ $28, AX, R12
395
396 XORQ R10, R15 // R15 = ((f XOR g) AND e) XOR g, an equivalent form of Ch(e, f, g)
397 XORQ R12, R14 // R14 = BIGSIGMA0(a)
398 MOVQ AX, R12
399 ANDQ CX, R12
400
401 ADDQ R13, R15 // R15 = Ch + BIGSIGMA1
402 ORQ R12, DI // DI = ((a OR c) AND b) OR (a AND c), an equivalent form of Maj(a, b, c)
403 ADDQ R14, R11
404
405 ADDQ R15, R8 // d = d + T1 (next round's e)
406
407 ADDQ R15, R11
408 ADDQ DI, R11 // h = T1 + T2 (next round's a)
409
410 VPSRLQ $8, Y1, Y2
411 VPSLLQ $(64-8), Y1, Y1
412 VPOR Y2, Y1, Y1 // Y1 = ROTR(8, W[t-15])
413
414 VPXOR Y8, Y3, Y3
415 VPXOR Y1, Y3, Y1 // Y1 = SIGMA0(W[t-15])
416
417 VPADDQ Y1, Y0, Y0 // Y0 = W[t-16] + W[t-7] + SIGMA0(W[t-15])
418
419 VPERM2F128 $0x0, Y0, Y0, Y4 // Y4 = low 128 bits of Y0 in both halves
420
421 VPAND MASK_YMM_LO<>(SB), Y0, Y0 // clear the low 128 bits of Y0
422
423 VPERM2F128 $0x11, Y7, Y7, Y2 // Y2 = high 128 bits of Y7 in both halves
424 VPSRLQ $6, Y2, Y8 // Y8 = SHR(6, Y2)
425
426 MOVQ R11, DI // round 1 of 16: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10
427 RORXQ $41, R8, R13
428 RORXQ $18, R8, R14
429 ADDQ 1*8+frame_YFER(SP), R10
430 ORQ BX, DI
431
432 MOVQ DX, R15
433 RORXQ $34, R11, R12
434 XORQ R14, R13
435 XORQ R9, R15
436
437 RORXQ $14, R8, R14
438 XORQ R14, R13
439 RORXQ $39, R11, R14
440 ANDQ R8, R15
441 ADDQ R10, CX
442
443 ANDQ AX, DI
444 XORQ R12, R14
445
446 RORXQ $28, R11, R12
447 XORQ R9, R15
448
449 XORQ R12, R14
450 MOVQ R11, R12
451 ANDQ BX, R12
452 ADDQ R13, R15
453
454 ORQ R12, DI
455 ADDQ R14, R10
456
457 ADDQ R15, CX
458 ADDQ R15, R10
459 ADDQ DI, R10
460
461 VPSRLQ $19, Y2, Y3
462 VPSLLQ $(64-19), Y2, Y1
463 VPOR Y1, Y3, Y3
464 VPXOR Y3, Y8, Y8
465 VPSRLQ $61, Y2, Y3
466 VPSLLQ $(64-61), Y2, Y1
467 VPOR Y1, Y3, Y3
468 VPXOR Y3, Y8, Y8 // Y8 = SIGMA1(Y2)
469
470 VPADDQ Y8, Y4, Y4 // Y4 = first two new schedule words (in both halves)
471
472 VPSRLQ $6, Y4, Y8 // Y8 = SHR(6, Y4)
473
474 MOVQ R10, DI // round 2 of 16: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9
475 RORXQ $41, CX, R13
476 ADDQ 2*8+frame_YFER(SP), R9
477
478 RORXQ $18, CX, R14
479 ORQ AX, DI
480 MOVQ R8, R15
481 XORQ DX, R15
482
483 RORXQ $34, R10, R12
484 XORQ R14, R13
485 ANDQ CX, R15
486
487 RORXQ $14, CX, R14
488 ADDQ R9, BX
489 ANDQ R11, DI
490
491 XORQ R14, R13
492 RORXQ $39, R10, R14
493 XORQ DX, R15
494
495 XORQ R12, R14
496 RORXQ $28, R10, R12
497
498 XORQ R12, R14
499 MOVQ R10, R12
500 ANDQ AX, R12
501 ADDQ R13, R15
502
503 ORQ R12, DI
504 ADDQ R14, R9
505 ADDQ R15, BX
506 ADDQ R15, R9
507
508 ADDQ DI, R9
509
510 VPSRLQ $19, Y4, Y3
511 VPSLLQ $(64-19), Y4, Y1
512 VPOR Y1, Y3, Y3
513 VPXOR Y3, Y8, Y8
514 VPSRLQ $61, Y4, Y3
515 VPSLLQ $(64-61), Y4, Y1
516 VPOR Y1, Y3, Y3
517 VPXOR Y3, Y8, Y8 // Y8 = SIGMA1(Y4)
518
519 VPADDQ Y8, Y0, Y2 // Y2 high half = last two new schedule words
520
521 VPBLENDD $0xF0, Y2, Y4, Y4 // Y4 = {low 128 bits of Y4, high 128 bits of Y2}: four new schedule words
522
523 MOVQ R9, DI // round 3 of 16: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX
524 RORXQ $41, BX, R13
525 RORXQ $18, BX, R14
526 ADDQ 3*8+frame_YFER(SP), DX
527 ORQ R11, DI
528
529 MOVQ CX, R15
530 RORXQ $34, R9, R12
531 XORQ R14, R13
532 XORQ R8, R15
533
534 RORXQ $14, BX, R14
535 ANDQ BX, R15
536 ADDQ DX, AX
537 ANDQ R10, DI
538
539 XORQ R14, R13
540 XORQ R8, R15
541
542 RORXQ $39, R9, R14
543 ADDQ R13, R15
544
545 XORQ R12, R14
546 ADDQ R15, AX
547
548 RORXQ $28, R9, R12
549
550 XORQ R12, R14
551 MOVQ R9, R12
552 ANDQ R11, R12
553 ORQ R12, DI
554
555 ADDQ R14, DX
556 ADDQ R15, DX
557 ADDQ DI, DX
558
559 VPADDQ 1*32(BP), Y5, Y0 // second group of loop1: Y0 = K + W for rounds 4..7; Y5 is rescheduled below
560 VMOVDQU Y0, frame_YFER(SP)
561
562 MY_VPALIGNR(Y0, Y4, Y7, 8)
563
564 VPADDQ Y5, Y0, Y0
565
566 MY_VPALIGNR(Y1, Y6, Y5, 8)
567
568 VPSRLQ $1, Y1, Y2
569 VPSLLQ $(64-1), Y1, Y3
570 VPOR Y2, Y3, Y3
571
572 VPSRLQ $7, Y1, Y8
573
574 MOVQ DX, DI // round 4 of 16: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8
575 RORXQ $41, AX, R13
576 RORXQ $18, AX, R14
577 ADDQ frame_YFER(SP), R8
578 ORQ R10, DI
579 MOVQ BX, R15
580 RORXQ $34, DX, R12
581
582 XORQ R14, R13
583 XORQ CX, R15
584 RORXQ $14, AX, R14
585
586 ANDQ AX, R15
587 XORQ R14, R13
588 RORXQ $39, DX, R14
589 ADDQ R8, R11
590
591 ANDQ R9, DI
592 XORQ R12, R14
593 RORXQ $28, DX, R12
594
595 XORQ CX, R15
596 XORQ R12, R14
597 MOVQ DX, R12
598 ANDQ R10, R12
599
600 ADDQ R13, R15
601 ORQ R12, DI
602 ADDQ R14, R8
603
604 ADDQ R15, R11
605
606 ADDQ R15, R8
607 ADDQ DI, R8
608
609 VPSRLQ $8, Y1, Y2
610 VPSLLQ $(64-8), Y1, Y1
611 VPOR Y2, Y1, Y1
612
613 VPXOR Y8, Y3, Y3
614 VPXOR Y1, Y3, Y1
615
616 VPADDQ Y1, Y0, Y0
617
618 VPERM2F128 $0x0, Y0, Y0, Y5
619
620 VPAND MASK_YMM_LO<>(SB), Y0, Y0
621
622 VPERM2F128 $0x11, Y4, Y4, Y2
623 VPSRLQ $6, Y2, Y8
624
625 MOVQ R8, DI // round 5 of 16: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX
626 RORXQ $41, R11, R13
627 RORXQ $18, R11, R14
628 ADDQ 1*8+frame_YFER(SP), CX
629 ORQ R9, DI
630
631 MOVQ AX, R15
632 RORXQ $34, R8, R12
633 XORQ R14, R13
634 XORQ BX, R15
635
636 RORXQ $14, R11, R14
637 XORQ R14, R13
638 RORXQ $39, R8, R14
639 ANDQ R11, R15
640 ADDQ CX, R10
641
642 ANDQ DX, DI
643 XORQ R12, R14
644
645 RORXQ $28, R8, R12
646 XORQ BX, R15
647
648 XORQ R12, R14
649 MOVQ R8, R12
650 ANDQ R9, R12
651 ADDQ R13, R15
652
653 ORQ R12, DI
654 ADDQ R14, CX
655
656 ADDQ R15, R10
657 ADDQ R15, CX
658 ADDQ DI, CX
659
660 VPSRLQ $19, Y2, Y3
661 VPSLLQ $(64-19), Y2, Y1
662 VPOR Y1, Y3, Y3
663 VPXOR Y3, Y8, Y8
664 VPSRLQ $61, Y2, Y3
665 VPSLLQ $(64-61), Y2, Y1
666 VPOR Y1, Y3, Y3
667 VPXOR Y3, Y8, Y8
668
669 VPADDQ Y8, Y5, Y5
670
671 VPSRLQ $6, Y5, Y8
672
673 MOVQ CX, DI // round 6 of 16: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX
674 RORXQ $41, R10, R13
675 ADDQ 2*8+frame_YFER(SP), BX
676
677 RORXQ $18, R10, R14
678 ORQ DX, DI
679 MOVQ R11, R15
680 XORQ AX, R15
681
682 RORXQ $34, CX, R12
683 XORQ R14, R13
684 ANDQ R10, R15
685
686 RORXQ $14, R10, R14
687 ADDQ BX, R9
688 ANDQ R8, DI
689
690 XORQ R14, R13
691 RORXQ $39, CX, R14
692 XORQ AX, R15
693
694 XORQ R12, R14
695 RORXQ $28, CX, R12
696
697 XORQ R12, R14
698 MOVQ CX, R12
699 ANDQ DX, R12
700 ADDQ R13, R15
701
702 ORQ R12, DI
703 ADDQ R14, BX
704 ADDQ R15, R9
705 ADDQ R15, BX
706
707 ADDQ DI, BX
708
709 VPSRLQ $19, Y5, Y3
710 VPSLLQ $(64-19), Y5, Y1
711 VPOR Y1, Y3, Y3
712 VPXOR Y3, Y8, Y8
713 VPSRLQ $61, Y5, Y3
714 VPSLLQ $(64-61), Y5, Y1
715 VPOR Y1, Y3, Y3
716 VPXOR Y3, Y8, Y8
717
718 VPADDQ Y8, Y0, Y2
719
720 VPBLENDD $0xF0, Y2, Y5, Y5 // Y5 now holds four new schedule words
721
722 MOVQ BX, DI // round 7 of 16: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX
723 RORXQ $41, R9, R13
724 RORXQ $18, R9, R14
725 ADDQ 3*8+frame_YFER(SP), AX
726 ORQ R8, DI
727
728 MOVQ R10, R15
729 RORXQ $34, BX, R12
730 XORQ R14, R13
731 XORQ R11, R15
732
733 RORXQ $14, R9, R14
734 ANDQ R9, R15
735 ADDQ AX, DX
736 ANDQ CX, DI
737
738 XORQ R14, R13
739 XORQ R11, R15
740
741 RORXQ $39, BX, R14
742 ADDQ R13, R15
743
744 XORQ R12, R14
745 ADDQ R15, DX
746
747 RORXQ $28, BX, R12
748
749 XORQ R12, R14
750 MOVQ BX, R12
751 ANDQ R8, R12
752 ORQ R12, DI
753
754 ADDQ R14, AX
755 ADDQ R15, AX
756 ADDQ DI, AX
757
758 VPADDQ 2*32(BP), Y6, Y0 // third group of loop1: Y0 = K + W for rounds 8..11; Y6 is rescheduled below
759 VMOVDQU Y0, frame_YFER(SP)
760
761 MY_VPALIGNR(Y0, Y5, Y4, 8)
762
763 VPADDQ Y6, Y0, Y0
764
765 MY_VPALIGNR(Y1, Y7, Y6, 8)
766
767 VPSRLQ $1, Y1, Y2
768 VPSLLQ $(64-1), Y1, Y3
769 VPOR Y2, Y3, Y3
770
771 VPSRLQ $7, Y1, Y8
772
773 MOVQ AX, DI // round 8 of 16: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11 (same roles as round 0)
774 RORXQ $41, DX, R13
775 RORXQ $18, DX, R14
776 ADDQ frame_YFER(SP), R11
777 ORQ CX, DI
778 MOVQ R9, R15
779 RORXQ $34, AX, R12
780
781 XORQ R14, R13
782 XORQ R10, R15
783 RORXQ $14, DX, R14
784
785 ANDQ DX, R15
786 XORQ R14, R13
787 RORXQ $39, AX, R14
788 ADDQ R11, R8
789
790 ANDQ BX, DI
791 XORQ R12, R14
792 RORXQ $28, AX, R12
793
794 XORQ R10, R15
795 XORQ R12, R14
796 MOVQ AX, R12
797 ANDQ CX, R12
798
799 ADDQ R13, R15
800 ORQ R12, DI
801 ADDQ R14, R11
802
803 ADDQ R15, R8
804
805 ADDQ R15, R11
806 ADDQ DI, R11
807
808 VPSRLQ $8, Y1, Y2
809 VPSLLQ $(64-8), Y1, Y1
810 VPOR Y2, Y1, Y1
811
812 VPXOR Y8, Y3, Y3
813 VPXOR Y1, Y3, Y1
814
815 VPADDQ Y1, Y0, Y0
816
817 VPERM2F128 $0x0, Y0, Y0, Y6
818
819 VPAND MASK_YMM_LO<>(SB), Y0, Y0
820
821 VPERM2F128 $0x11, Y5, Y5, Y2
822 VPSRLQ $6, Y2, Y8
823
824 MOVQ R11, DI // round 9 of 16: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10
825 RORXQ $41, R8, R13
826 RORXQ $18, R8, R14
827 ADDQ 1*8+frame_YFER(SP), R10
828 ORQ BX, DI
829
830 MOVQ DX, R15
831 RORXQ $34, R11, R12
832 XORQ R14, R13
833 XORQ R9, R15
834
835 RORXQ $14, R8, R14
836 XORQ R14, R13
837 RORXQ $39, R11, R14
838 ANDQ R8, R15
839 ADDQ R10, CX
840
841 ANDQ AX, DI
842 XORQ R12, R14
843
844 RORXQ $28, R11, R12
845 XORQ R9, R15
846
847 XORQ R12, R14
848 MOVQ R11, R12
849 ANDQ BX, R12
850 ADDQ R13, R15
851
852 ORQ R12, DI
853 ADDQ R14, R10
854
855 ADDQ R15, CX
856 ADDQ R15, R10
857 ADDQ DI, R10
858
859 VPSRLQ $19, Y2, Y3
860 VPSLLQ $(64-19), Y2, Y1
861 VPOR Y1, Y3, Y3
862 VPXOR Y3, Y8, Y8
863 VPSRLQ $61, Y2, Y3
864 VPSLLQ $(64-61), Y2, Y1
865 VPOR Y1, Y3, Y3
866 VPXOR Y3, Y8, Y8
867
868 VPADDQ Y8, Y6, Y6
869
870 VPSRLQ $6, Y6, Y8
871
872 MOVQ R10, DI // round 10 of 16: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9
873 RORXQ $41, CX, R13
874 ADDQ 2*8+frame_YFER(SP), R9
875
876 RORXQ $18, CX, R14
877 ORQ AX, DI
878 MOVQ R8, R15
879 XORQ DX, R15
880
881 RORXQ $34, R10, R12
882 XORQ R14, R13
883 ANDQ CX, R15
884
885 RORXQ $14, CX, R14
886 ADDQ R9, BX
887 ANDQ R11, DI
888
889 XORQ R14, R13
890 RORXQ $39, R10, R14
891 XORQ DX, R15
892
893 XORQ R12, R14
894 RORXQ $28, R10, R12
895
896 XORQ R12, R14
897 MOVQ R10, R12
898 ANDQ AX, R12
899 ADDQ R13, R15
900
901 ORQ R12, DI
902 ADDQ R14, R9
903 ADDQ R15, BX
904 ADDQ R15, R9
905
906 ADDQ DI, R9
907
908 VPSRLQ $19, Y6, Y3
909 VPSLLQ $(64-19), Y6, Y1
910 VPOR Y1, Y3, Y3
911 VPXOR Y3, Y8, Y8
912 VPSRLQ $61, Y6, Y3
913 VPSLLQ $(64-61), Y6, Y1
914 VPOR Y1, Y3, Y3
915 VPXOR Y3, Y8, Y8
916
917 VPADDQ Y8, Y0, Y2
918
919 VPBLENDD $0xF0, Y2, Y6, Y6 // Y6 now holds four new schedule words
920
921 MOVQ R9, DI // round 11 of 16: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX
922 RORXQ $41, BX, R13
923 RORXQ $18, BX, R14
924 ADDQ 3*8+frame_YFER(SP), DX
925 ORQ R11, DI
926
927 MOVQ CX, R15
928 RORXQ $34, R9, R12
929 XORQ R14, R13
930 XORQ R8, R15
931
932 RORXQ $14, BX, R14
933 ANDQ BX, R15
934 ADDQ DX, AX
935 ANDQ R10, DI
936
937 XORQ R14, R13
938 XORQ R8, R15
939
940 RORXQ $39, R9, R14
941 ADDQ R13, R15
942
943 XORQ R12, R14
944 ADDQ R15, AX
945
946 RORXQ $28, R9, R12
947
948 XORQ R12, R14
949 MOVQ R9, R12
950 ANDQ R11, R12
951 ORQ R12, DI
952
953 ADDQ R14, DX
954 ADDQ R15, DX
955 ADDQ DI, DX
956
957 VPADDQ 3*32(BP), Y7, Y0 // fourth group of loop1: Y0 = K + W for rounds 12..15; Y7 is rescheduled below
958 VMOVDQU Y0, frame_YFER(SP)
959 ADDQ $(4*32), BP // advance the constant pointer past the 16 constants used by this pass
960
961 MY_VPALIGNR(Y0, Y6, Y5, 8)
962
963 VPADDQ Y7, Y0, Y0
964
965 MY_VPALIGNR(Y1, Y4, Y7, 8)
966
967 VPSRLQ $1, Y1, Y2
968 VPSLLQ $(64-1), Y1, Y3
969 VPOR Y2, Y3, Y3
970
971 VPSRLQ $7, Y1, Y8
972
973 MOVQ DX, DI // round 12 of 16: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8
974 RORXQ $41, AX, R13
975 RORXQ $18, AX, R14
976 ADDQ frame_YFER(SP), R8
977 ORQ R10, DI
978 MOVQ BX, R15
979 RORXQ $34, DX, R12
980
981 XORQ R14, R13
982 XORQ CX, R15
983 RORXQ $14, AX, R14
984
985 ANDQ AX, R15
986 XORQ R14, R13
987 RORXQ $39, DX, R14
988 ADDQ R8, R11
989
990 ANDQ R9, DI
991 XORQ R12, R14
992 RORXQ $28, DX, R12
993
994 XORQ CX, R15
995 XORQ R12, R14
996 MOVQ DX, R12
997 ANDQ R10, R12
998
999 ADDQ R13, R15
1000 ORQ R12, DI
1001 ADDQ R14, R8
1002
1003 ADDQ R15, R11
1004
1005 ADDQ R15, R8
1006 ADDQ DI, R8
1007
1008 VPSRLQ $8, Y1, Y2
1009 VPSLLQ $(64-8), Y1, Y1
1010 VPOR Y2, Y1, Y1
1011
1012 VPXOR Y8, Y3, Y3
1013 VPXOR Y1, Y3, Y1
1014
1015 VPADDQ Y1, Y0, Y0
1016
1017 VPERM2F128 $0x0, Y0, Y0, Y7
1018
1019 VPAND MASK_YMM_LO<>(SB), Y0, Y0
1020
1021 VPERM2F128 $0x11, Y6, Y6, Y2
1022 VPSRLQ $6, Y2, Y8
1023
1024 MOVQ R8, DI // round 13 of 16: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX
1025 RORXQ $41, R11, R13
1026 RORXQ $18, R11, R14
1027 ADDQ 1*8+frame_YFER(SP), CX
1028 ORQ R9, DI
1029
1030 MOVQ AX, R15
1031 RORXQ $34, R8, R12
1032 XORQ R14, R13
1033 XORQ BX, R15
1034
1035 RORXQ $14, R11, R14
1036 XORQ R14, R13
1037 RORXQ $39, R8, R14
1038 ANDQ R11, R15
1039 ADDQ CX, R10
1040
1041 ANDQ DX, DI
1042 XORQ R12, R14
1043
1044 RORXQ $28, R8, R12
1045 XORQ BX, R15
1046
1047 XORQ R12, R14
1048 MOVQ R8, R12
1049 ANDQ R9, R12
1050 ADDQ R13, R15
1051
1052 ORQ R12, DI
1053 ADDQ R14, CX
1054
1055 ADDQ R15, R10
1056 ADDQ R15, CX
1057 ADDQ DI, CX
1058
1059 VPSRLQ $19, Y2, Y3
1060 VPSLLQ $(64-19), Y2, Y1
1061 VPOR Y1, Y3, Y3
1062 VPXOR Y3, Y8, Y8
1063 VPSRLQ $61, Y2, Y3
1064 VPSLLQ $(64-61), Y2, Y1
1065 VPOR Y1, Y3, Y3
1066 VPXOR Y3, Y8, Y8
1067
1068 VPADDQ Y8, Y7, Y7
1069
1070 VPSRLQ $6, Y7, Y8
1071
1072 MOVQ CX, DI // round 14 of 16: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX
1073 RORXQ $41, R10, R13
1074 ADDQ 2*8+frame_YFER(SP), BX
1075
1076 RORXQ $18, R10, R14
1077 ORQ DX, DI
1078 MOVQ R11, R15
1079 XORQ AX, R15
1080
1081 RORXQ $34, CX, R12
1082 XORQ R14, R13
1083 ANDQ R10, R15
1084
1085 RORXQ $14, R10, R14
1086 ADDQ BX, R9
1087 ANDQ R8, DI
1088
1089 XORQ R14, R13
1090 RORXQ $39, CX, R14
1091 XORQ AX, R15
1092
1093 XORQ R12, R14
1094 RORXQ $28, CX, R12
1095
1096 XORQ R12, R14
1097 MOVQ CX, R12
1098 ANDQ DX, R12
1099 ADDQ R13, R15
1100
1101 ORQ R12, DI
1102 ADDQ R14, BX
1103 ADDQ R15, R9
1104 ADDQ R15, BX
1105
1106 ADDQ DI, BX
1107
1108 VPSRLQ $19, Y7, Y3
1109 VPSLLQ $(64-19), Y7, Y1
1110 VPOR Y1, Y3, Y3
1111 VPXOR Y3, Y8, Y8
1112 VPSRLQ $61, Y7, Y3
1113 VPSLLQ $(64-61), Y7, Y1
1114 VPOR Y1, Y3, Y3
1115 VPXOR Y3, Y8, Y8
1116
1117 VPADDQ Y8, Y0, Y2
1118
1119 VPBLENDD $0xF0, Y2, Y7, Y7 // Y7 now holds four new schedule words
1120
1121 MOVQ BX, DI // round 15 of 16: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX
1122 RORXQ $41, R9, R13
1123 RORXQ $18, R9, R14
1124 ADDQ 3*8+frame_YFER(SP), AX
1125 ORQ R8, DI
1126
1127 MOVQ R10, R15
1128 RORXQ $34, BX, R12
1129 XORQ R14, R13
1130 XORQ R11, R15
1131
1132 RORXQ $14, R9, R14
1133 ANDQ R9, R15
1134 ADDQ AX, DX
1135 ANDQ CX, DI
1136
1137 XORQ R14, R13
1138 XORQ R11, R15
1139
1140 RORXQ $39, BX, R14
1141 ADDQ R13, R15
1142
1143 XORQ R12, R14
1144 ADDQ R15, DX
1145
1146 RORXQ $28, BX, R12
1147
1148 XORQ R12, R14
1149 MOVQ BX, R12
1150 ANDQ R8, R12
1151 ORQ R12, DI
1152
1153 ADDQ R14, AX
1154 ADDQ R15, AX
1155 ADDQ DI, AX
1156
1157 SUBQ $1, frame_SRND(SP) // one pass of 16 rounds done
1158 JNE loop1 // repeat until the counter stored before loop1 reaches zero
1159
1160 MOVQ $2, frame_SRND(SP) // loop2 runs twice, 8 rounds per pass
1161
// loop2: final rounds. No new schedule words are computed here; each pass runs
// 8 scalar rounds using the schedule words already held in Y4 and Y5.
// The rounds are overlapped: the last addition into a round's new `a` value
// (ADDQ DI, ...) is issued after the next round has already started.
1162 loop2:
1163 VPADDQ (BP), Y4, Y0 // Y0 = K + W for the next four rounds
1164 VMOVDQU Y0, frame_YFER(SP)
1165
1166 MOVQ R9, R15 // round 0 of 8: a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11
1167 RORXQ $41, DX, R13
1168 RORXQ $18, DX, R14
1169 XORQ R10, R15
1170
1171 XORQ R14, R13
1172 RORXQ $14, DX, R14
1173 ANDQ DX, R15
1174
1175 XORQ R14, R13
1176 RORXQ $34, AX, R12
1177 XORQ R10, R15
1178 RORXQ $39, AX, R14
1179 MOVQ AX, DI
1180
1181 XORQ R12, R14
1182 RORXQ $28, AX, R12
1183 ADDQ frame_YFER(SP), R11
1184 ORQ CX, DI
1185
1186 XORQ R12, R14
1187 MOVQ AX, R12
1188 ANDQ BX, DI
1189 ANDQ CX, R12
1190 ADDQ R13, R15
1191
1192 ADDQ R11, R8
1193 ORQ R12, DI
1194 ADDQ R14, R11
1195
1196 ADDQ R15, R8
1197
1198 ADDQ R15, R11
1199 MOVQ DX, R15 // round 1 of 8: a=R11 b=AX c=BX d=CX e=R8 f=DX g=R9 h=R10
1200 RORXQ $41, R8, R13
1201 RORXQ $18, R8, R14
1202 XORQ R9, R15
1203
1204 XORQ R14, R13
1205 RORXQ $14, R8, R14
1206 ANDQ R8, R15
1207 ADDQ DI, R11 // completes round 0: adds its Maj term into the new a
1208
1209 XORQ R14, R13
1210 RORXQ $34, R11, R12
1211 XORQ R9, R15
1212 RORXQ $39, R11, R14
1213 MOVQ R11, DI
1214
1215 XORQ R12, R14
1216 RORXQ $28, R11, R12
1217 ADDQ 8*1+frame_YFER(SP), R10
1218 ORQ BX, DI
1219
1220 XORQ R12, R14
1221 MOVQ R11, R12
1222 ANDQ AX, DI
1223 ANDQ BX, R12
1224 ADDQ R13, R15
1225
1226 ADDQ R10, CX
1227 ORQ R12, DI
1228 ADDQ R14, R10
1229
1230 ADDQ R15, CX
1231
1232 ADDQ R15, R10
1233 MOVQ R8, R15 // round 2 of 8: a=R10 b=R11 c=AX d=BX e=CX f=R8 g=DX h=R9
1234 RORXQ $41, CX, R13
1235 RORXQ $18, CX, R14
1236 XORQ DX, R15
1237
1238 XORQ R14, R13
1239 RORXQ $14, CX, R14
1240 ANDQ CX, R15
1241 ADDQ DI, R10 // completes round 1
1242
1243 XORQ R14, R13
1244 RORXQ $34, R10, R12
1245 XORQ DX, R15
1246 RORXQ $39, R10, R14
1247 MOVQ R10, DI
1248
1249 XORQ R12, R14
1250 RORXQ $28, R10, R12
1251 ADDQ 8*2+frame_YFER(SP), R9
1252 ORQ AX, DI
1253
1254 XORQ R12, R14
1255 MOVQ R10, R12
1256 ANDQ R11, DI
1257 ANDQ AX, R12
1258 ADDQ R13, R15
1259
1260 ADDQ R9, BX
1261 ORQ R12, DI
1262 ADDQ R14, R9
1263
1264 ADDQ R15, BX
1265
1266 ADDQ R15, R9
1267 MOVQ CX, R15 // round 3 of 8: a=R9 b=R10 c=R11 d=AX e=BX f=CX g=R8 h=DX
1268 RORXQ $41, BX, R13
1269 RORXQ $18, BX, R14
1270 XORQ R8, R15
1271
1272 XORQ R14, R13
1273 RORXQ $14, BX, R14
1274 ANDQ BX, R15
1275 ADDQ DI, R9 // completes round 2
1276
1277 XORQ R14, R13
1278 RORXQ $34, R9, R12
1279 XORQ R8, R15
1280 RORXQ $39, R9, R14
1281 MOVQ R9, DI
1282
1283 XORQ R12, R14
1284 RORXQ $28, R9, R12
1285 ADDQ 8*3+frame_YFER(SP), DX
1286 ORQ R11, DI
1287
1288 XORQ R12, R14
1289 MOVQ R9, R12
1290 ANDQ R10, DI
1291 ANDQ R11, R12
1292 ADDQ R13, R15
1293
1294 ADDQ DX, AX
1295 ORQ R12, DI
1296 ADDQ R14, DX
1297
1298 ADDQ R15, AX
1299
1300 ADDQ R15, DX
1301
1302 ADDQ DI, DX // completes round 3
1303
1304 VPADDQ 1*32(BP), Y5, Y0 // second half of loop2: Y0 = K + W for rounds 4..7 of this pass
1305 VMOVDQU Y0, frame_YFER(SP)
1306 ADDQ $(2*32), BP // advance the constant pointer past the 8 constants used by this pass
1307
1308 MOVQ BX, R15 // round 4 of 8: a=DX b=R9 c=R10 d=R11 e=AX f=BX g=CX h=R8
1309 RORXQ $41, AX, R13
1310 RORXQ $18, AX, R14
1311 XORQ CX, R15
1312
1313 XORQ R14, R13
1314 RORXQ $14, AX, R14
1315 ANDQ AX, R15
1316
1317 XORQ R14, R13
1318 RORXQ $34, DX, R12
1319 XORQ CX, R15
1320 RORXQ $39, DX, R14
1321 MOVQ DX, DI
1322
1323 XORQ R12, R14
1324 RORXQ $28, DX, R12
1325 ADDQ frame_YFER(SP), R8
1326 ORQ R10, DI
1327
1328 XORQ R12, R14
1329 MOVQ DX, R12
1330 ANDQ R9, DI
1331 ANDQ R10, R12
1332 ADDQ R13, R15
1333
1334 ADDQ R8, R11
1335 ORQ R12, DI
1336 ADDQ R14, R8
1337
1338 ADDQ R15, R11
1339
1340 ADDQ R15, R8
1341 MOVQ AX, R15 // round 5 of 8: a=R8 b=DX c=R9 d=R10 e=R11 f=AX g=BX h=CX
1342 RORXQ $41, R11, R13
1343 RORXQ $18, R11, R14
1344 XORQ BX, R15
1345
1346 XORQ R14, R13
1347 RORXQ $14, R11, R14
1348 ANDQ R11, R15
1349 ADDQ DI, R8 // completes round 4
1350
1351 XORQ R14, R13
1352 RORXQ $34, R8, R12
1353 XORQ BX, R15
1354 RORXQ $39, R8, R14
1355 MOVQ R8, DI
1356
1357 XORQ R12, R14
1358 RORXQ $28, R8, R12
1359 ADDQ 8*1+frame_YFER(SP), CX
1360 ORQ R9, DI
1361
1362 XORQ R12, R14
1363 MOVQ R8, R12
1364 ANDQ DX, DI
1365 ANDQ R9, R12
1366 ADDQ R13, R15
1367
1368 ADDQ CX, R10
1369 ORQ R12, DI
1370 ADDQ R14, CX
1371
1372 ADDQ R15, R10
1373
1374 ADDQ R15, CX
1375 MOVQ R11, R15 // round 6 of 8: a=CX b=R8 c=DX d=R9 e=R10 f=R11 g=AX h=BX
1376 RORXQ $41, R10, R13
1377 RORXQ $18, R10, R14
1378 XORQ AX, R15
1379
1380 XORQ R14, R13
1381 RORXQ $14, R10, R14
1382 ANDQ R10, R15
1383 ADDQ DI, CX // completes round 5
1384
1385 XORQ R14, R13
1386 RORXQ $34, CX, R12
1387 XORQ AX, R15
1388 RORXQ $39, CX, R14
1389 MOVQ CX, DI
1390
1391 XORQ R12, R14
1392 RORXQ $28, CX, R12
1393 ADDQ 8*2+frame_YFER(SP), BX
1394 ORQ DX, DI
1395
1396 XORQ R12, R14
1397 MOVQ CX, R12
1398 ANDQ R8, DI
1399 ANDQ DX, R12
1400 ADDQ R13, R15
1401
1402 ADDQ BX, R9
1403 ORQ R12, DI
1404 ADDQ R14, BX
1405
1406 ADDQ R15, R9
1407
1408 ADDQ R15, BX
1409 MOVQ R10, R15 // round 7 of 8: a=BX b=CX c=R8 d=DX e=R9 f=R10 g=R11 h=AX
1410 RORXQ $41, R9, R13
1411 RORXQ $18, R9, R14
1412 XORQ R11, R15
1413
1414 XORQ R14, R13
1415 RORXQ $14, R9, R14
1416 ANDQ R9, R15
1417 ADDQ DI, BX // completes round 6
1418
1419 XORQ R14, R13
1420 RORXQ $34, BX, R12
1421 XORQ R11, R15
1422 RORXQ $39, BX, R14
1423 MOVQ BX, DI
1424
1425 XORQ R12, R14
1426 RORXQ $28, BX, R12
1427 ADDQ 8*3+frame_YFER(SP), AX
1428 ORQ R8, DI
1429
1430 XORQ R12, R14
1431 MOVQ BX, R12
1432 ANDQ CX, DI
1433 ANDQ R8, R12
1434 ADDQ R13, R15
1435
1436 ADDQ AX, DX
1437 ORQ R12, DI
1438 ADDQ R14, AX
1439
1440 ADDQ R15, DX
1441
1442 ADDQ R15, AX
1443
1444 ADDQ DI, AX // completes round 7
1445
1446 VMOVDQU Y6, Y4 // the second pass of loop2 consumes the words held in Y6 and Y7
1447 VMOVDQU Y7, Y5
1448
1449 SUBQ $1, frame_SRND(SP)
1450 JNE loop2
1451
// Fold the working variables back into the digest: each addm adds the digest
// word in memory into the register and stores the sum back, so the registers
// also hold the updated hash words for the next block.
1452 addm(8*0(SI),AX)
1453 addm(8*1(SI),BX)
1454 addm(8*2(SI),CX)
1455 addm(8*3(SI),R8)
1456 addm(8*4(SI),DX)
1457 addm(8*5(SI),R9)
1458 addm(8*6(SI),R10)
1459 addm(8*7(SI),R11)
1460
1461 MOVQ frame_INP(SP), DI // restore the input pointer (DI was used as scratch in the rounds)
1462 ADDQ $128, DI // advance to the next 128-byte block
1463 CMPQ DI, frame_INPEND(SP)
1464 JNE loop0 // continue until DI reaches the saved end pointer
1465
1466 done_hash:
1467 VZEROUPPER // clear the upper halves of the YMM registers before returning
1468 RET
1469
View as plain text