1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 //
3 // Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
4 // Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5 // Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
6 //
7 // This code is taken from the OpenSSL project but the author, Andy Polyakov,
8 // has relicensed it under the licenses specified in the SPDX header above.
9 // The original headers, including the original license headers, are
10 // included below for completeness.
11 //
12 // ====================================================================
13 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 // project. The module is, however, dual licensed under OpenSSL and
15 // CRYPTOGAMS licenses depending on where you obtain it. For further
16 // details see http://www.openssl.org/~appro/cryptogams/.
17 // ====================================================================
18 //
19 // November 2014
20 //
21 // ChaCha20 for x86_64.
22 //
23 // December 2016
24 //
25 // Add AVX512F code path.
26 //
27 // December 2017
28 //
29 // Add AVX512VL code path.
30 //
31 // Performance in cycles per byte out of large buffer.
32 //
33 //              IALU/gcc 4.8(i) 1x/2xSSSE3(ii)  4xSSSE3     NxAVX(v)
34 //
35 // P4           9.48/+99%       -               -
36 // Core2        7.83/+55%       7.90/5.76       4.35
37 // Westmere     7.19/+50%       5.60/4.50       3.00
38 // Sandy Bridge 8.31/+42%       5.45/4.00       2.72
39 // Ivy Bridge   6.71/+46%       5.40/?          2.41
40 // Haswell      5.92/+43%       5.20/3.45       2.42        1.23
41 // Skylake[-X]  5.87/+39%       4.70/3.22       2.31        1.19[0.80(vi)]
42 // Silvermont   12.0/+33%       7.75/6.90       7.03(iii)
43 // Knights L    11.7/-          ?               9.60(iii)   0.80
44 // Goldmont     10.6/+17%       5.10/3.52       3.28
45 // Sledgehammer 7.28/+52%       -               -
46 // Bulldozer    9.66/+28%       9.85/5.35(iv)   3.06(iv)
47 // Ryzen        5.96/+50%       5.19/3.00       2.40        2.09
48 // VIA Nano     10.5/+46%       6.72/6.88       6.05
49 //
50 // (i)  compared to older gcc 3.x, one can observe a >2x improvement on
51 //      most platforms;
52 // (ii) 2xSSSE3 is a code path optimized specifically for the 128-byte
53 //      inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
54 // (iii) this is not an optimal result for Atom because of MSROM
55 //      limitations; SSE2 can do better, but the gain is considered too
56 //      low to justify the [maintenance] effort;
57 // (iv) Bulldozer actually executes the 4xXOP code path, which delivers
58 //      2.20 and 4.85 for 128-byte inputs;
59 // (v)  8xAVX2, 8xAVX512VL or 16xAVX512F, whichever is best applicable;
60 // (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
61 //      cpb in a single thread, the corresponding capability is suppressed;
62
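//
// For reference, every code path below implements the standard ChaCha20
// quarter-round (RFC 8439),
//
//      a += b; d ^= a; d <<<= 16;
//      c += d; b ^= c; b <<<= 12;
//      a += b; d ^= a; d <<<=  8;
//      c += d; b ^= c; b <<<=  7;
//
// applied as 10 pairs of column and diagonal rounds ("double rounds").
// The byte-sized rotations (16 and 8) are done with pshufb against the
// .Lrot16/.Lrot24 masks defined below; the 12- and 7-bit rotations use
// shift/shift/or sequences (vpslld/vpsrld/vpor in the AVX paths).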
63 //#include <linux/linkage.h>
64 .section .rodata.cst16.Lzero, "aM", @progbits, 16
65 .align 16
66 .Lzero:
67 .long 0,0,0,0
68 .section .rodata.cst16.Lone, "aM", @progbits, 16
69 .align 16
70 .Lone:
71 .long 1,0,0,0
72 .section .rodata.cst16.Linc, "aM", @progbits, 16
73 .align 16
74 .Linc:
75 .long 0,1,2,3
76 .section .rodata.cst16.Lfour, "aM", @progbits, 16
77 .align 16
78 .Lfour:
79 .long 4,4,4,4
80 .section .rodata.cst32.Lincy, "aM", @progbits, 32
81 .align 32
82 .Lincy:
83 .long 0,2,4,6,1,3,5,7
84 .section .rodata.cst32.Leight, "aM", @progbits, 32
85 .align 32
86 .Leight:
87 .long 8,8,8,8,8,8,8,8
88 .section .rodata.cst16.Lrot16, "aM", @progbits, 16
89 .align 16
90 .Lrot16:
91 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
92 .section .rodata.cst16.Lrot24, "aM", @progbits, 16
93 .align 16
94 .Lrot24:
95 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
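// .Lrot16 and .Lrot24 are pshufb masks that rotate each 32-bit lane left by
// 16 and by 8 bits respectively, covering the two byte-aligned rotation
// amounts of the quarter-round.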
96 .section .rodata.cst32.Ltwoy, "aM", @progbits, 32
97 .align 32
98 .Ltwoy:
99 .long 2,0,0,0, 2,0,0,0
100 .section .rodata.cst64.Lzeroz, "aM", @progbits, 64
101 .align 64
102 .Lzeroz:
103 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
104 .section .rodata.cst64.Lfourz, "aM", @progbits, 64
105 .align 64
106 .Lfourz:
107 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
108 .section .rodata.cst64.Lincz, "aM", @progbits, 64
109 .align 64
110 .Lincz:
111 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
112 .section .rodata.cst64.Lsixteen, "aM", @progbits, 64
113 .align 64
114 .Lsixteen:
115 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
116 .section .rodata.cst16.Lsigma, "aM", @progbits, 16
117 .align 16
118 .Lsigma:
119 .ascii "expand 32-byte k"
120 .text
121 #ifdef CONFIG_AS_SSSE3
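# HChaCha20 core (the keyed permutation used for XChaCha20 key derivation).
# Arguments, as read from the loads and stores below:
#       %rdi = 32-byte output, %rsi = 16-byte nonce, %rdx = 32-byte key.
# Runs 10 double rounds over (sigma, key, nonce) and stores state rows 0 and
# 3, with no final feed-forward addition.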
122 .align 32
123 SYM_FUNC_START(hchacha20_ssse3)
124 .Lhchacha20_ssse3:
125         movdqa  .Lsigma(%rip),%xmm0
126         movdqu  (%rdx),%xmm1
127         movdqu  16(%rdx),%xmm2
128         movdqu  (%rsi),%xmm3
129         # This code is only used when targeting the kernel.
130         # If targeting win64, xmm{6,7} would need to be preserved.
131         movdqa  .Lrot16(%rip),%xmm6
132         movdqa  .Lrot24(%rip),%xmm7
133         mov     $10,%r8         # 10 double rounds
134         jmp     1f
135 .align  32
136 1:
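        # Each iteration is one double round on rows %xmm0-%xmm3: a column
        # round, a pshufd permutation of three rows ($147/$78/$57) that turns
        # the diagonals into columns, a second column round, and the inverse
        # permutation.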
137         paddd   %xmm1,%xmm0
138         pxor    %xmm0,%xmm3
139         pshufb  %xmm6,%xmm3
140         paddd   %xmm3,%xmm2
141         pxor    %xmm2,%xmm1
142         movdqa  %xmm1,%xmm4
143         psrld   $20,%xmm1
144         pslld   $12,%xmm4
145         por     %xmm4,%xmm1
146         paddd   %xmm1,%xmm0
147         pxor    %xmm0,%xmm3
148         pshufb  %xmm7,%xmm3
149         paddd   %xmm3,%xmm2
150         pxor    %xmm2,%xmm1
151         movdqa  %xmm1,%xmm4
152         psrld   $25,%xmm1
153         pslld   $7,%xmm4
154         por     %xmm4,%xmm1
155         pshufd  $147,%xmm0,%xmm0
156         pshufd  $78,%xmm3,%xmm3
157         pshufd  $57,%xmm2,%xmm2
158         nop     
159         paddd   %xmm1,%xmm0
160         pxor    %xmm0,%xmm3
161         pshufb  %xmm6,%xmm3
162         paddd   %xmm3,%xmm2
163         pxor    %xmm2,%xmm1
164         movdqa  %xmm1,%xmm4
165         psrld   $20,%xmm1
166         pslld   $12,%xmm4
167         por     %xmm4,%xmm1
168         paddd   %xmm1,%xmm0
169         pxor    %xmm0,%xmm3
170         pshufb  %xmm7,%xmm3
171         paddd   %xmm3,%xmm2
172         pxor    %xmm2,%xmm1
173         movdqa  %xmm1,%xmm4
174         psrld   $25,%xmm1
175         pslld   $7,%xmm4
176         por     %xmm4,%xmm1
177         pshufd  $57,%xmm0,%xmm0
178         pshufd  $78,%xmm3,%xmm3
179         pshufd  $147,%xmm2,%xmm2
180         dec     %r8
181         jnz     1b
182         movdqu %xmm0, (%rdi)
183         movdqu %xmm3, 16(%rdi)
184         ret
185 SYM_FUNC_END(hchacha20_ssse3)
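# ChaCha20 stream cipher, SSSE3 entry point.  Arguments, as used below:
#       %rdi = output, %rsi = input, %rdx = length in bytes,
#       %rcx = 32-byte key, %r8 = 16-byte counter||nonce block.
# Exactly 128 bytes are handled by the two-block path, anything larger by
# the four-block path; otherwise one 64-byte block is produced per outer
# iteration, with a trailing partial block handled byte by byte.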
186 .align 32
187 SYM_FUNC_START(chacha20_ssse3)
188 .Lchacha20_ssse3:
189         lea     8(%rsp),%r10            # frame pointer
190         cmp     $128,%rdx               # we might throw away some data,
191         je      .Lchacha20_128
192         ja      .Lchacha20_4x           # but overall it won't be slower
193
194 .Ldo_ssse3_after_all:
195         sub     $64+8,%rsp
196         and $-16,%rsp
197         movdqa  .Lsigma(%rip),%xmm0
198         movdqu  (%rcx),%xmm1
199         movdqu  16(%rcx),%xmm2
200         movdqu  (%r8),%xmm3
201         movdqa  .Lrot16(%rip),%xmm6
202         movdqa  .Lrot24(%rip),%xmm7
203
204         movdqa  %xmm0,0x00(%rsp)
205         movdqa  %xmm1,0x10(%rsp)
206         movdqa  %xmm2,0x20(%rsp)
207         movdqa  %xmm3,0x30(%rsp)
208         mov     $10,%r8         # reuse %r8
209         jmp     .Loop_ssse3
210
211 .align  32
212 .Loop_outer_ssse3:
213         movdqa  .Lone(%rip),%xmm3
214         movdqa  0x00(%rsp),%xmm0
215         movdqa  0x10(%rsp),%xmm1
216         movdqa  0x20(%rsp),%xmm2
217         paddd   0x30(%rsp),%xmm3
218         mov     $10,%r8
219         movdqa  %xmm3,0x30(%rsp)
220         jmp     .Loop_ssse3
221
222 .align  32
223 .Loop_ssse3:
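        # Same double-round structure as the hchacha20_ssse3 loop above.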
224         paddd   %xmm1,%xmm0
225         pxor    %xmm0,%xmm3
226         pshufb  %xmm6,%xmm3
227         paddd   %xmm3,%xmm2
228         pxor    %xmm2,%xmm1
229         movdqa  %xmm1,%xmm4
230         psrld   $20,%xmm1
231         pslld   $12,%xmm4
232         por     %xmm4,%xmm1
233         paddd   %xmm1,%xmm0
234         pxor    %xmm0,%xmm3
235         pshufb  %xmm7,%xmm3
236         paddd   %xmm3,%xmm2
237         pxor    %xmm2,%xmm1
238         movdqa  %xmm1,%xmm4
239         psrld   $25,%xmm1
240         pslld   $7,%xmm4
241         por     %xmm4,%xmm1
242         pshufd  $147,%xmm0,%xmm0
243         pshufd  $78,%xmm3,%xmm3
244         pshufd  $57,%xmm2,%xmm2
245         nop     
246         paddd   %xmm1,%xmm0
247         pxor    %xmm0,%xmm3
248         pshufb  %xmm6,%xmm3
249         paddd   %xmm3,%xmm2
250         pxor    %xmm2,%xmm1
251         movdqa  %xmm1,%xmm4
252         psrld   $20,%xmm1
253         pslld   $12,%xmm4
254         por     %xmm4,%xmm1
255         paddd   %xmm1,%xmm0
256         pxor    %xmm0,%xmm3
257         pshufb  %xmm7,%xmm3
258         paddd   %xmm3,%xmm2
259         pxor    %xmm2,%xmm1
260         movdqa  %xmm1,%xmm4
261         psrld   $25,%xmm1
262         pslld   $7,%xmm4
263         por     %xmm4,%xmm1
264         pshufd  $57,%xmm0,%xmm0
265         pshufd  $78,%xmm3,%xmm3
266         pshufd  $147,%xmm2,%xmm2
267         dec     %r8
268         jnz     .Loop_ssse3
269         paddd   0x00(%rsp),%xmm0
270         paddd   0x10(%rsp),%xmm1
271         paddd   0x20(%rsp),%xmm2
272         paddd   0x30(%rsp),%xmm3
273
274         cmp     $64,%rdx
275         jb      .Ltail_ssse3
276
277         movdqu  0x00(%rsi),%xmm4
278         movdqu  0x10(%rsi),%xmm5
279         pxor    %xmm4,%xmm0                     # xor with input
280         movdqu  0x20(%rsi),%xmm4
281         pxor    %xmm5,%xmm1
282         movdqu  0x30(%rsi),%xmm5
283         lea     0x40(%rsi),%rsi         # inp+=64
284         pxor    %xmm4,%xmm2
285         pxor    %xmm5,%xmm3
286
287         movdqu  %xmm0,0x00(%rdi)                # write output
288         movdqu  %xmm1,0x10(%rdi)
289         movdqu  %xmm2,0x20(%rdi)
290         movdqu  %xmm3,0x30(%rdi)
291         lea     0x40(%rdi),%rdi         # out+=64
292
293         sub     $64,%rdx
294         jnz     .Loop_outer_ssse3
295
296         jmp     .Ldone_ssse3
297
298 .align  16
299 .Ltail_ssse3:
300         movdqa  %xmm0,0x00(%rsp)
301         movdqa  %xmm1,0x10(%rsp)
302         movdqa  %xmm2,0x20(%rsp)
303         movdqa  %xmm3,0x30(%rsp)
304         xor     %r8,%r8
305
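        # Partial final block: the 64-byte keystream block was just stored at
        # (%rsp), so xor it into the remaining input one byte at a time.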
306 .Loop_tail_ssse3:
307         movzb   (%rsi,%r8),%eax
308         movzb   (%rsp,%r8),%ecx
309         lea     1(%r8),%r8
310         xor     %ecx,%eax
311         mov     %al,-1(%rdi,%r8)
312         dec     %rdx
313         jnz     .Loop_tail_ssse3
314
315 .Ldone_ssse3:
316         lea     -8(%r10),%rsp
317 .Lssse3_epilogue:
318         ret
319 SYM_FUNC_END(chacha20_ssse3)
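# Two-block path for inputs of exactly 128 bytes: two copies of the state
# are processed in interleaved fashion, %xmm8/%xmm9/%xmm2/%xmm3 for the
# first block and %xmm10/%xmm11/%xmm0/%xmm1 for the second, whose counter is
# bumped by .Lone.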
320 .type   chacha20_128,@function
321 .align  32
322 chacha20_128:
323 .Lchacha20_128:
324         lea     8(%rsp),%r10            # frame pointer
325         sub     $64+8,%rsp
326         and $-16,%rsp
327         movdqa  .Lsigma(%rip),%xmm8
328         movdqu  (%rcx),%xmm9
329         movdqu  16(%rcx),%xmm2
330         movdqu  (%r8),%xmm3
331         movdqa  .Lone(%rip),%xmm1
332         movdqa  .Lrot16(%rip),%xmm6
333         movdqa  .Lrot24(%rip),%xmm7
334
335         movdqa  %xmm8,%xmm10
336         movdqa  %xmm8,0x00(%rsp)
337         movdqa  %xmm9,%xmm11
338         movdqa  %xmm9,0x10(%rsp)
339         movdqa  %xmm2,%xmm0
340         movdqa  %xmm2,0x20(%rsp)
341         paddd   %xmm3,%xmm1
342         movdqa  %xmm3,0x30(%rsp)
343         mov     $10,%r8         # reuse %r8
344         jmp     .Loop_128
345
346 .align  32
347 .Loop_128:
348         paddd   %xmm9,%xmm8
349         pxor    %xmm8,%xmm3
350         paddd   %xmm11,%xmm10
351         pxor    %xmm10,%xmm1
352         pshufb  %xmm6,%xmm3
353         pshufb  %xmm6,%xmm1
354         paddd   %xmm3,%xmm2
355         paddd   %xmm1,%xmm0
356         pxor    %xmm2,%xmm9
357         pxor    %xmm0,%xmm11
358         movdqa  %xmm9,%xmm4
359         psrld   $20,%xmm9
360         movdqa  %xmm11,%xmm5
361         pslld   $12,%xmm4
362         psrld   $20,%xmm11
363         por     %xmm4,%xmm9
364         pslld   $12,%xmm5
365         por     %xmm5,%xmm11
366         paddd   %xmm9,%xmm8
367         pxor    %xmm8,%xmm3
368         paddd   %xmm11,%xmm10
369         pxor    %xmm10,%xmm1
370         pshufb  %xmm7,%xmm3
371         pshufb  %xmm7,%xmm1
372         paddd   %xmm3,%xmm2
373         paddd   %xmm1,%xmm0
374         pxor    %xmm2,%xmm9
375         pxor    %xmm0,%xmm11
376         movdqa  %xmm9,%xmm4
377         psrld   $25,%xmm9
378         movdqa  %xmm11,%xmm5
379         pslld   $7,%xmm4
380         psrld   $25,%xmm11
381         por     %xmm4,%xmm9
382         pslld   $7,%xmm5
383         por     %xmm5,%xmm11
384         pshufd  $147,%xmm8,%xmm8
385         pshufd  $78,%xmm3,%xmm3
386         pshufd  $57,%xmm2,%xmm2
387         pshufd  $147,%xmm10,%xmm10
388         pshufd  $78,%xmm1,%xmm1
389         pshufd  $57,%xmm0,%xmm0
390         paddd   %xmm9,%xmm8
391         pxor    %xmm8,%xmm3
392         paddd   %xmm11,%xmm10
393         pxor    %xmm10,%xmm1
394         pshufb  %xmm6,%xmm3
395         pshufb  %xmm6,%xmm1
396         paddd   %xmm3,%xmm2
397         paddd   %xmm1,%xmm0
398         pxor    %xmm2,%xmm9
399         pxor    %xmm0,%xmm11
400         movdqa  %xmm9,%xmm4
401         psrld   $20,%xmm9
402         movdqa  %xmm11,%xmm5
403         pslld   $12,%xmm4
404         psrld   $20,%xmm11
405         por     %xmm4,%xmm9
406         pslld   $12,%xmm5
407         por     %xmm5,%xmm11
408         paddd   %xmm9,%xmm8
409         pxor    %xmm8,%xmm3
410         paddd   %xmm11,%xmm10
411         pxor    %xmm10,%xmm1
412         pshufb  %xmm7,%xmm3
413         pshufb  %xmm7,%xmm1
414         paddd   %xmm3,%xmm2
415         paddd   %xmm1,%xmm0
416         pxor    %xmm2,%xmm9
417         pxor    %xmm0,%xmm11
418         movdqa  %xmm9,%xmm4
419         psrld   $25,%xmm9
420         movdqa  %xmm11,%xmm5
421         pslld   $7,%xmm4
422         psrld   $25,%xmm11
423         por     %xmm4,%xmm9
424         pslld   $7,%xmm5
425         por     %xmm5,%xmm11
426         pshufd  $57,%xmm8,%xmm8
427         pshufd  $78,%xmm3,%xmm3
428         pshufd  $147,%xmm2,%xmm2
429         pshufd  $57,%xmm10,%xmm10
430         pshufd  $78,%xmm1,%xmm1
431         pshufd  $147,%xmm0,%xmm0
432         dec     %r8
433         jnz     .Loop_128
434         paddd   0x00(%rsp),%xmm8
435         paddd   0x10(%rsp),%xmm9
436         paddd   0x20(%rsp),%xmm2
437         paddd   0x30(%rsp),%xmm3
438         paddd   .Lone(%rip),%xmm1
439         paddd   0x00(%rsp),%xmm10
440         paddd   0x10(%rsp),%xmm11
441         paddd   0x20(%rsp),%xmm0
442         paddd   0x30(%rsp),%xmm1
443
444         movdqu  0x00(%rsi),%xmm4
445         movdqu  0x10(%rsi),%xmm5
446         pxor    %xmm4,%xmm8                     # xor with input
447         movdqu  0x20(%rsi),%xmm4
448         pxor    %xmm5,%xmm9
449         movdqu  0x30(%rsi),%xmm5
450         pxor    %xmm4,%xmm2
451         movdqu  0x40(%rsi),%xmm4
452         pxor    %xmm5,%xmm3
453         movdqu  0x50(%rsi),%xmm5
454         pxor    %xmm4,%xmm10
455         movdqu  0x60(%rsi),%xmm4
456         pxor    %xmm5,%xmm11
457         movdqu  0x70(%rsi),%xmm5
458         pxor    %xmm4,%xmm0
459         pxor    %xmm5,%xmm1
460
461         movdqu  %xmm8,0x00(%rdi)                # write output
462         movdqu  %xmm9,0x10(%rdi)
463         movdqu  %xmm2,0x20(%rdi)
464         movdqu  %xmm3,0x30(%rdi)
465         movdqu  %xmm10,0x40(%rdi)
466         movdqu  %xmm11,0x50(%rdi)
467         movdqu  %xmm0,0x60(%rdi)
468         movdqu  %xmm1,0x70(%rdi)
469         lea     -8(%r10),%rsp
470 .L128_epilogue:
471         ret
472 .size   chacha20_128,.-chacha20_128
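# Four-block SSSE3 path: the state is kept transposed so that each register
# holds the same state word for four consecutive blocks (counters 0..3 from
# .Linc, advanced by .Lfour per outer iteration).  Since 16 registers cannot
# hold all 16x4 words, some are spilled to the stack during the rounds, and
# the finished blocks are "de-interlaced" with punpck* before being xored
# with the input.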
473 .type   chacha20_4x,@function
474 .align  32
475 chacha20_4x:
476 .Lchacha20_4x:
477         lea             8(%rsp),%r10            # frame pointer
478         cmp             $192,%rdx
479         ja              .Lproceed4x
480 .Lproceed4x:
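        # The cmp/ja above targets the label that immediately follows, so it
        # has no effect; it appears to be a leftover from a short-input
        # fallback that was removed in this port.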
481         sub             $0x140+8,%rsp
482         and             $-16,%rsp
483         movdqa          .Lsigma(%rip),%xmm11    # key[0]
484         movdqu          (%rcx),%xmm15           # key[1]
485         movdqu          16(%rcx),%xmm7          # key[2]
486         movdqu          (%r8),%xmm3             # key[3]
487         lea             0x100(%rsp),%rcx        # size optimization
488         lea             .Lrot16(%rip),%r9
489         lea             .Lrot24(%rip),%r11
490
491         pshufd          $0x00,%xmm11,%xmm8      # smash key by lanes...
492         pshufd          $0x55,%xmm11,%xmm9
493         movdqa          %xmm8,0x40(%rsp)                # ... and offload
494         pshufd          $0xaa,%xmm11,%xmm10
495         movdqa          %xmm9,0x50(%rsp)
496         pshufd          $0xff,%xmm11,%xmm11
497         movdqa          %xmm10,0x60(%rsp)
498         movdqa          %xmm11,0x70(%rsp)
499
500         pshufd          $0x00,%xmm15,%xmm12
501         pshufd          $0x55,%xmm15,%xmm13
502         movdqa          %xmm12,0x80-0x100(%rcx)
503         pshufd          $0xaa,%xmm15,%xmm14
504         movdqa          %xmm13,0x90-0x100(%rcx)
505         pshufd          $0xff,%xmm15,%xmm15
506         movdqa          %xmm14,0xa0-0x100(%rcx)
507         movdqa          %xmm15,0xb0-0x100(%rcx)
508
509         pshufd          $0x00,%xmm7,%xmm4       # ""
510         pshufd          $0x55,%xmm7,%xmm5       # ""
511         movdqa          %xmm4,0xc0-0x100(%rcx)
512         pshufd          $0xaa,%xmm7,%xmm6       # ""
513         movdqa          %xmm5,0xd0-0x100(%rcx)
514         pshufd          $0xff,%xmm7,%xmm7       # ""
515         movdqa          %xmm6,0xe0-0x100(%rcx)
516         movdqa          %xmm7,0xf0-0x100(%rcx)
517
518         pshufd          $0x00,%xmm3,%xmm0
519         pshufd          $0x55,%xmm3,%xmm1
520         paddd           .Linc(%rip),%xmm0       # don't save counters yet
521         pshufd          $0xaa,%xmm3,%xmm2
522         movdqa          %xmm1,0x110-0x100(%rcx)
523         pshufd          $0xff,%xmm3,%xmm3
524         movdqa          %xmm2,0x120-0x100(%rcx)
525         movdqa          %xmm3,0x130-0x100(%rcx)
526
527         jmp             .Loop_enter4x
528
529 .align  32
530 .Loop_outer4x:
531         movdqa          0x40(%rsp),%xmm8                # re-load smashed key
532         movdqa          0x50(%rsp),%xmm9
533         movdqa          0x60(%rsp),%xmm10
534         movdqa          0x70(%rsp),%xmm11
535         movdqa          0x80-0x100(%rcx),%xmm12
536         movdqa          0x90-0x100(%rcx),%xmm13
537         movdqa          0xa0-0x100(%rcx),%xmm14
538         movdqa          0xb0-0x100(%rcx),%xmm15
539         movdqa          0xc0-0x100(%rcx),%xmm4  # ""
540         movdqa          0xd0-0x100(%rcx),%xmm5  # ""
541         movdqa          0xe0-0x100(%rcx),%xmm6  # ""
542         movdqa          0xf0-0x100(%rcx),%xmm7  # ""
543         movdqa          0x100-0x100(%rcx),%xmm0
544         movdqa          0x110-0x100(%rcx),%xmm1
545         movdqa          0x120-0x100(%rcx),%xmm2
546         movdqa          0x130-0x100(%rcx),%xmm3
547         paddd           .Lfour(%rip),%xmm0      # next SIMD counters
548
549 .Loop_enter4x:
550         movdqa          %xmm6,0x20(%rsp)                # offload "xc2"
551         movdqa          %xmm7,0x30(%rsp)                # offload "xc3"
552         movdqa          (%r9),%xmm7             # .Lrot16(%rip)
553         mov             $10,%eax
554         movdqa          %xmm0,0x100-0x100(%rcx) # save SIMD counters
555         jmp             .Loop4x
556
557 .align  32
558 .Loop4x:
559         paddd   %xmm12,%xmm8
560         paddd   %xmm13,%xmm9
561         pxor    %xmm8,%xmm0
562         pxor    %xmm9,%xmm1
563         pshufb  %xmm7,%xmm0
564         pshufb  %xmm7,%xmm1
565         paddd   %xmm0,%xmm4
566         paddd   %xmm1,%xmm5
567         pxor    %xmm4,%xmm12
568         pxor    %xmm5,%xmm13
569         movdqa  %xmm12,%xmm6
570         pslld   $12,%xmm12
571         psrld   $20,%xmm6
572         movdqa  %xmm13,%xmm7
573         pslld   $12,%xmm13
574         por     %xmm6,%xmm12
575         psrld   $20,%xmm7
576         movdqa  (%r11),%xmm6
577         por     %xmm7,%xmm13
578         paddd   %xmm12,%xmm8
579         paddd   %xmm13,%xmm9
580         pxor    %xmm8,%xmm0
581         pxor    %xmm9,%xmm1
582         pshufb  %xmm6,%xmm0
583         pshufb  %xmm6,%xmm1
584         paddd   %xmm0,%xmm4
585         paddd   %xmm1,%xmm5
586         pxor    %xmm4,%xmm12
587         pxor    %xmm5,%xmm13
588         movdqa  %xmm12,%xmm7
589         pslld   $7,%xmm12
590         psrld   $25,%xmm7
591         movdqa  %xmm13,%xmm6
592         pslld   $7,%xmm13
593         por     %xmm7,%xmm12
594         psrld   $25,%xmm6
595         movdqa  (%r9),%xmm7
596         por     %xmm6,%xmm13
597         movdqa  %xmm4,0(%rsp)
598         movdqa  %xmm5,16(%rsp)
599         movdqa  32(%rsp),%xmm4
600         movdqa  48(%rsp),%xmm5
601         paddd   %xmm14,%xmm10
602         paddd   %xmm15,%xmm11
603         pxor    %xmm10,%xmm2
604         pxor    %xmm11,%xmm3
605         pshufb  %xmm7,%xmm2
606         pshufb  %xmm7,%xmm3
607         paddd   %xmm2,%xmm4
608         paddd   %xmm3,%xmm5
609         pxor    %xmm4,%xmm14
610         pxor    %xmm5,%xmm15
611         movdqa  %xmm14,%xmm6
612         pslld   $12,%xmm14
613         psrld   $20,%xmm6
614         movdqa  %xmm15,%xmm7
615         pslld   $12,%xmm15
616         por     %xmm6,%xmm14
617         psrld   $20,%xmm7
618         movdqa  (%r11),%xmm6
619         por     %xmm7,%xmm15
620         paddd   %xmm14,%xmm10
621         paddd   %xmm15,%xmm11
622         pxor    %xmm10,%xmm2
623         pxor    %xmm11,%xmm3
624         pshufb  %xmm6,%xmm2
625         pshufb  %xmm6,%xmm3
626         paddd   %xmm2,%xmm4
627         paddd   %xmm3,%xmm5
628         pxor    %xmm4,%xmm14
629         pxor    %xmm5,%xmm15
630         movdqa  %xmm14,%xmm7
631         pslld   $7,%xmm14
632         psrld   $25,%xmm7
633         movdqa  %xmm15,%xmm6
634         pslld   $7,%xmm15
635         por     %xmm7,%xmm14
636         psrld   $25,%xmm6
637         movdqa  (%r9),%xmm7
638         por     %xmm6,%xmm15
639         paddd   %xmm13,%xmm8
640         paddd   %xmm14,%xmm9
641         pxor    %xmm8,%xmm3
642         pxor    %xmm9,%xmm0
643         pshufb  %xmm7,%xmm3
644         pshufb  %xmm7,%xmm0
645         paddd   %xmm3,%xmm4
646         paddd   %xmm0,%xmm5
647         pxor    %xmm4,%xmm13
648         pxor    %xmm5,%xmm14
649         movdqa  %xmm13,%xmm6
650         pslld   $12,%xmm13
651         psrld   $20,%xmm6
652         movdqa  %xmm14,%xmm7
653         pslld   $12,%xmm14
654         por     %xmm6,%xmm13
655         psrld   $20,%xmm7
656         movdqa  (%r11),%xmm6
657         por     %xmm7,%xmm14
658         paddd   %xmm13,%xmm8
659         paddd   %xmm14,%xmm9
660         pxor    %xmm8,%xmm3
661         pxor    %xmm9,%xmm0
662         pshufb  %xmm6,%xmm3
663         pshufb  %xmm6,%xmm0
664         paddd   %xmm3,%xmm4
665         paddd   %xmm0,%xmm5
666         pxor    %xmm4,%xmm13
667         pxor    %xmm5,%xmm14
668         movdqa  %xmm13,%xmm7
669         pslld   $7,%xmm13
670         psrld   $25,%xmm7
671         movdqa  %xmm14,%xmm6
672         pslld   $7,%xmm14
673         por     %xmm7,%xmm13
674         psrld   $25,%xmm6
675         movdqa  (%r9),%xmm7
676         por     %xmm6,%xmm14
677         movdqa  %xmm4,32(%rsp)
678         movdqa  %xmm5,48(%rsp)
679         movdqa  0(%rsp),%xmm4
680         movdqa  16(%rsp),%xmm5
681         paddd   %xmm15,%xmm10
682         paddd   %xmm12,%xmm11
683         pxor    %xmm10,%xmm1
684         pxor    %xmm11,%xmm2
685         pshufb  %xmm7,%xmm1
686         pshufb  %xmm7,%xmm2
687         paddd   %xmm1,%xmm4
688         paddd   %xmm2,%xmm5
689         pxor    %xmm4,%xmm15
690         pxor    %xmm5,%xmm12
691         movdqa  %xmm15,%xmm6
692         pslld   $12,%xmm15
693         psrld   $20,%xmm6
694         movdqa  %xmm12,%xmm7
695         pslld   $12,%xmm12
696         por     %xmm6,%xmm15
697         psrld   $20,%xmm7
698         movdqa  (%r11),%xmm6
699         por     %xmm7,%xmm12
700         paddd   %xmm15,%xmm10
701         paddd   %xmm12,%xmm11
702         pxor    %xmm10,%xmm1
703         pxor    %xmm11,%xmm2
704         pshufb  %xmm6,%xmm1
705         pshufb  %xmm6,%xmm2
706         paddd   %xmm1,%xmm4
707         paddd   %xmm2,%xmm5
708         pxor    %xmm4,%xmm15
709         pxor    %xmm5,%xmm12
710         movdqa  %xmm15,%xmm7
711         pslld   $7,%xmm15
712         psrld   $25,%xmm7
713         movdqa  %xmm12,%xmm6
714         pslld   $7,%xmm12
715         por     %xmm7,%xmm15
716         psrld   $25,%xmm6
717         movdqa  (%r9),%xmm7
718         por     %xmm6,%xmm12
719         dec             %eax
720         jnz             .Loop4x
721
722         paddd           0x40(%rsp),%xmm8                # accumulate key material
723         paddd           0x50(%rsp),%xmm9
724         paddd           0x60(%rsp),%xmm10
725         paddd           0x70(%rsp),%xmm11
726
727         movdqa          %xmm8,%xmm6             # "de-interlace" data
728         punpckldq       %xmm9,%xmm8
729         movdqa          %xmm10,%xmm7
730         punpckldq       %xmm11,%xmm10
731         punpckhdq       %xmm9,%xmm6
732         punpckhdq       %xmm11,%xmm7
733         movdqa          %xmm8,%xmm9
734         punpcklqdq      %xmm10,%xmm8            # "a0"
735         movdqa          %xmm6,%xmm11
736         punpcklqdq      %xmm7,%xmm6             # "a2"
737         punpckhqdq      %xmm10,%xmm9            # "a1"
738         punpckhqdq      %xmm7,%xmm11            # "a3"
739         paddd           0x80-0x100(%rcx),%xmm12
740         paddd           0x90-0x100(%rcx),%xmm13
741         paddd           0xa0-0x100(%rcx),%xmm14
742         paddd           0xb0-0x100(%rcx),%xmm15
743
744         movdqa          %xmm8,0x00(%rsp)                # offload 
745         movdqa          %xmm9,0x10(%rsp)
746         movdqa          0x20(%rsp),%xmm8                # "xc2"
747         movdqa          0x30(%rsp),%xmm9                # "xc3"
748
749         movdqa          %xmm12,%xmm10
750         punpckldq       %xmm13,%xmm12
751         movdqa          %xmm14,%xmm7
752         punpckldq       %xmm15,%xmm14
753         punpckhdq       %xmm13,%xmm10
754         punpckhdq       %xmm15,%xmm7
755         movdqa          %xmm12,%xmm13
756         punpcklqdq      %xmm14,%xmm12           # "b0"
757         movdqa          %xmm10,%xmm15
758         punpcklqdq      %xmm7,%xmm10            # "b2"
759         punpckhqdq      %xmm14,%xmm13           # "b1"
760         punpckhqdq      %xmm7,%xmm15            # "b3"
761         paddd           0xc0-0x100(%rcx),%xmm4
762         paddd           0xd0-0x100(%rcx),%xmm5
763         paddd           0xe0-0x100(%rcx),%xmm8
764         paddd           0xf0-0x100(%rcx),%xmm9
765
766         movdqa          %xmm6,0x20(%rsp)                # keep offloading 
767         movdqa          %xmm11,0x30(%rsp)
768
769         movdqa          %xmm4,%xmm14
770         punpckldq       %xmm5,%xmm4
771         movdqa          %xmm8,%xmm7
772         punpckldq       %xmm9,%xmm8
773         punpckhdq       %xmm5,%xmm14
774         punpckhdq       %xmm9,%xmm7
775         movdqa          %xmm4,%xmm5
776         punpcklqdq      %xmm8,%xmm4             # "c0"
777         movdqa          %xmm14,%xmm9
778         punpcklqdq      %xmm7,%xmm14            # "c2"
779         punpckhqdq      %xmm8,%xmm5             # "c1"
780         punpckhqdq      %xmm7,%xmm9             # "c3"
781         paddd           0x100-0x100(%rcx),%xmm0
782         paddd           0x110-0x100(%rcx),%xmm1
783         paddd           0x120-0x100(%rcx),%xmm2
784         paddd           0x130-0x100(%rcx),%xmm3
785
786         movdqa          %xmm0,%xmm8
787         punpckldq       %xmm1,%xmm0
788         movdqa          %xmm2,%xmm7
789         punpckldq       %xmm3,%xmm2
790         punpckhdq       %xmm1,%xmm8
791         punpckhdq       %xmm3,%xmm7
792         movdqa          %xmm0,%xmm1
793         punpcklqdq      %xmm2,%xmm0             # "d0"
794         movdqa          %xmm8,%xmm3
795         punpcklqdq      %xmm7,%xmm8             # "d2"
796         punpckhqdq      %xmm2,%xmm1             # "d1"
797         punpckhqdq      %xmm7,%xmm3             # "d3"
798         cmp             $64*4,%rdx
799         jb              .Ltail4x
800
801         movdqu          0x00(%rsi),%xmm6                # xor with input
802         movdqu          0x10(%rsi),%xmm11
803         movdqu          0x20(%rsi),%xmm2
804         movdqu          0x30(%rsi),%xmm7
805         pxor            0x00(%rsp),%xmm6                #  is offloaded, remember?
806         pxor            %xmm12,%xmm11
807         pxor            %xmm4,%xmm2
808         pxor            %xmm0,%xmm7
809
810          movdqu         %xmm6,0x00(%rdi)
811         movdqu          0x40(%rsi),%xmm6
812          movdqu         %xmm11,0x10(%rdi)
813         movdqu          0x50(%rsi),%xmm11
814          movdqu         %xmm2,0x20(%rdi)
815         movdqu          0x60(%rsi),%xmm2
816          movdqu         %xmm7,0x30(%rdi)
817         movdqu          0x70(%rsi),%xmm7
818         lea             0x80(%rsi),%rsi         # size optimization
819         pxor            0x10(%rsp),%xmm6
820         pxor            %xmm13,%xmm11
821         pxor            %xmm5,%xmm2
822         pxor            %xmm1,%xmm7
823
824          movdqu         %xmm6,0x40(%rdi)
825         movdqu          0x00(%rsi),%xmm6
826          movdqu         %xmm11,0x50(%rdi)
827         movdqu          0x10(%rsi),%xmm11
828          movdqu         %xmm2,0x60(%rdi)
829         movdqu          0x20(%rsi),%xmm2
830          movdqu         %xmm7,0x70(%rdi)
831          lea            0x80(%rdi),%rdi         # size optimization
832         movdqu          0x30(%rsi),%xmm7
833         pxor            0x20(%rsp),%xmm6
834         pxor            %xmm10,%xmm11
835         pxor            %xmm14,%xmm2
836         pxor            %xmm8,%xmm7
837
838          movdqu         %xmm6,0x00(%rdi)
839         movdqu          0x40(%rsi),%xmm6
840          movdqu         %xmm11,0x10(%rdi)
841         movdqu          0x50(%rsi),%xmm11
842          movdqu         %xmm2,0x20(%rdi)
843         movdqu          0x60(%rsi),%xmm2
844          movdqu         %xmm7,0x30(%rdi)
845         movdqu          0x70(%rsi),%xmm7
846         lea             0x80(%rsi),%rsi         # inp+=64*4
847         pxor            0x30(%rsp),%xmm6
848         pxor            %xmm15,%xmm11
849         pxor            %xmm9,%xmm2
850         pxor            %xmm3,%xmm7
851         movdqu          %xmm6,0x40(%rdi)
852         movdqu          %xmm11,0x50(%rdi)
853         movdqu          %xmm2,0x60(%rdi)
854         movdqu          %xmm7,0x70(%rdi)
855         lea             0x80(%rdi),%rdi         # out+=64*4
856
857         sub             $64*4,%rdx
858         jnz             .Loop_outer4x
859
860         jmp             .Ldone4x
861
862 .Ltail4x:
863         cmp             $192,%rdx
864         jae             .L192_or_more4x
865         cmp             $128,%rdx
866         jae             .L128_or_more4x
867         cmp             $64,%rdx
868         jae             .L64_or_more4x
869
870         #movdqa         0x00(%rsp),%xmm6                #  is offloaded, remember?
871         xor             %r9,%r9
872         #movdqa         %xmm6,0x00(%rsp)
873         movdqa          %xmm12,0x10(%rsp)
874         movdqa          %xmm4,0x20(%rsp)
875         movdqa          %xmm0,0x30(%rsp)
876         jmp             .Loop_tail4x
877
878 .align  32
879 .L64_or_more4x:
880         movdqu          0x00(%rsi),%xmm6                # xor with input
881         movdqu          0x10(%rsi),%xmm11
882         movdqu          0x20(%rsi),%xmm2
883         movdqu          0x30(%rsi),%xmm7
884         pxor            0x00(%rsp),%xmm6                #  is offloaded, remember?
885         pxor            %xmm12,%xmm11
886         pxor            %xmm4,%xmm2
887         pxor            %xmm0,%xmm7
888         movdqu          %xmm6,0x00(%rdi)
889         movdqu          %xmm11,0x10(%rdi)
890         movdqu          %xmm2,0x20(%rdi)
891         movdqu          %xmm7,0x30(%rdi)
892         je              .Ldone4x
893
894         movdqa          0x10(%rsp),%xmm6                #  is offloaded, remember?
895         lea             0x40(%rsi),%rsi         # inp+=64*1
896         xor             %r9,%r9
897         movdqa          %xmm6,0x00(%rsp)
898         movdqa          %xmm13,0x10(%rsp)
899         lea             0x40(%rdi),%rdi         # out+=64*1
900         movdqa          %xmm5,0x20(%rsp)
901         sub             $64,%rdx                # len-=64*1
902         movdqa          %xmm1,0x30(%rsp)
903         jmp             .Loop_tail4x
904
905 .align  32
906 .L128_or_more4x:
907         movdqu          0x00(%rsi),%xmm6                # xor with input
908         movdqu          0x10(%rsi),%xmm11
909         movdqu          0x20(%rsi),%xmm2
910         movdqu          0x30(%rsi),%xmm7
911         pxor            0x00(%rsp),%xmm6                #  is offloaded, remember?
912         pxor            %xmm12,%xmm11
913         pxor            %xmm4,%xmm2
914         pxor            %xmm0,%xmm7
915
916          movdqu         %xmm6,0x00(%rdi)
917         movdqu          0x40(%rsi),%xmm6
918          movdqu         %xmm11,0x10(%rdi)
919         movdqu          0x50(%rsi),%xmm11
920          movdqu         %xmm2,0x20(%rdi)
921         movdqu          0x60(%rsi),%xmm2
922          movdqu         %xmm7,0x30(%rdi)
923         movdqu          0x70(%rsi),%xmm7
924         pxor            0x10(%rsp),%xmm6
925         pxor            %xmm13,%xmm11
926         pxor            %xmm5,%xmm2
927         pxor            %xmm1,%xmm7
928         movdqu          %xmm6,0x40(%rdi)
929         movdqu          %xmm11,0x50(%rdi)
930         movdqu          %xmm2,0x60(%rdi)
931         movdqu          %xmm7,0x70(%rdi)
932         je              .Ldone4x
933
934         movdqa          0x20(%rsp),%xmm6                #  is offloaded, remember?
935         lea             0x80(%rsi),%rsi         # inp+=64*2
936         xor             %r9,%r9
937         movdqa          %xmm6,0x00(%rsp)
938         movdqa          %xmm10,0x10(%rsp)
939         lea             0x80(%rdi),%rdi         # out+=64*2
940         movdqa          %xmm14,0x20(%rsp)
941         sub             $128,%rdx               # len-=64*2
942         movdqa          %xmm8,0x30(%rsp)
943         jmp             .Loop_tail4x
944
945 .align  32
946 .L192_or_more4x:
947         movdqu          0x00(%rsi),%xmm6                # xor with input
948         movdqu          0x10(%rsi),%xmm11
949         movdqu          0x20(%rsi),%xmm2
950         movdqu          0x30(%rsi),%xmm7
951         pxor            0x00(%rsp),%xmm6                #  is offloaded, remember?
952         pxor            %xmm12,%xmm11
953         pxor            %xmm4,%xmm2
954         pxor            %xmm0,%xmm7
955
956          movdqu         %xmm6,0x00(%rdi)
957         movdqu          0x40(%rsi),%xmm6
958          movdqu         %xmm11,0x10(%rdi)
959         movdqu          0x50(%rsi),%xmm11
960          movdqu         %xmm2,0x20(%rdi)
961         movdqu          0x60(%rsi),%xmm2
962          movdqu         %xmm7,0x30(%rdi)
963         movdqu          0x70(%rsi),%xmm7
964         lea             0x80(%rsi),%rsi         # size optimization
965         pxor            0x10(%rsp),%xmm6
966         pxor            %xmm13,%xmm11
967         pxor            %xmm5,%xmm2
968         pxor            %xmm1,%xmm7
969
970          movdqu         %xmm6,0x40(%rdi)
971         movdqu          0x00(%rsi),%xmm6
972          movdqu         %xmm11,0x50(%rdi)
973         movdqu          0x10(%rsi),%xmm11
974          movdqu         %xmm2,0x60(%rdi)
975         movdqu          0x20(%rsi),%xmm2
976          movdqu         %xmm7,0x70(%rdi)
977          lea            0x80(%rdi),%rdi         # size optimization
978         movdqu          0x30(%rsi),%xmm7
979         pxor            0x20(%rsp),%xmm6
980         pxor            %xmm10,%xmm11
981         pxor            %xmm14,%xmm2
982         pxor            %xmm8,%xmm7
983         movdqu          %xmm6,0x00(%rdi)
984         movdqu          %xmm11,0x10(%rdi)
985         movdqu          %xmm2,0x20(%rdi)
986         movdqu          %xmm7,0x30(%rdi)
987         je              .Ldone4x
988
989         movdqa          0x30(%rsp),%xmm6                #  is offloaded, remember?
990         lea             0x40(%rsi),%rsi         # inp+=64*3
991         xor             %r9,%r9
992         movdqa          %xmm6,0x00(%rsp)
993         movdqa          %xmm15,0x10(%rsp)
994         lea             0x40(%rdi),%rdi         # out+=64*3
995         movdqa          %xmm9,0x20(%rsp)
996         sub             $192,%rdx               # len-=64*3
997         movdqa          %xmm3,0x30(%rsp)
998
999 .Loop_tail4x:
1000         movzb           (%rsi,%r9),%eax
1001         movzb           (%rsp,%r9),%ecx
1002         lea             1(%r9),%r9
1003         xor             %ecx,%eax
1004         mov             %al,-1(%rdi,%r9)
1005         dec             %rdx
1006         jnz             .Loop_tail4x
1007
1008 .Ldone4x:
1009         lea             -8(%r10),%rsp
1010 .L4x_epilogue:
1011         ret
1012 .size   chacha20_4x,.-chacha20_4x
1013 #endif
1014 #ifdef CONFIG_AS_AVX2
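# Eight-block AVX2 path; same arguments as chacha20_ssse3 (%rdi = out,
# %rsi = in, %rdx = len, %rcx = key, %r8 = counter||nonce).  Each ymm
# register holds one state word across eight blocks, with counter lanes laid
# out in the interleaved order given by .Lincy and advanced by .Leight, which
# is why the results need the vperm2i128 "de-interlace" step at the end.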
1015 .align 32
1016 SYM_FUNC_START(chacha20_avx2)
1017 .Lchacha20_avx2:
1018 .Lchacha20_8x:
1019         lea             8(%rsp),%r10            # frame register
1020         sub             $0x280+8,%rsp
1021         and             $-32,%rsp
1022         vzeroupper
1023
1024         ################ stack layout
1025         # +0x00         SIMD equivalent of %r12d
1026         # ...
1027         # +0x80         constant copy of key[0-2] smashed by lanes
1028         # ...
1029         # +0x200        SIMD counters (with nonce smashed by lanes)
1030         # ...
1031         # +0x280
1032
1033         vbroadcasti128  .Lsigma(%rip),%ymm11    # key[0]
1034         vbroadcasti128  (%rcx),%ymm3            # key[1]
1035         vbroadcasti128  16(%rcx),%ymm15         # key[2]
1036         vbroadcasti128  (%r8),%ymm7             # key[3]
1037         lea             0x100(%rsp),%rcx        # size optimization
1038         lea             0x200(%rsp),%rax        # size optimization
1039         lea             .Lrot16(%rip),%r9
1040         lea             .Lrot24(%rip),%r11
1041
1042         vpshufd         $0x00,%ymm11,%ymm8      # smash key by lanes...
1043         vpshufd         $0x55,%ymm11,%ymm9
1044         vmovdqa         %ymm8,0x80-0x100(%rcx)  # ... and offload
1045         vpshufd         $0xaa,%ymm11,%ymm10
1046         vmovdqa         %ymm9,0xa0-0x100(%rcx)
1047         vpshufd         $0xff,%ymm11,%ymm11
1048         vmovdqa         %ymm10,0xc0-0x100(%rcx)
1049         vmovdqa         %ymm11,0xe0-0x100(%rcx)
1050
1051         vpshufd         $0x00,%ymm3,%ymm0
1052         vpshufd         $0x55,%ymm3,%ymm1
1053         vmovdqa         %ymm0,0x100-0x100(%rcx)
1054         vpshufd         $0xaa,%ymm3,%ymm2
1055         vmovdqa         %ymm1,0x120-0x100(%rcx)
1056         vpshufd         $0xff,%ymm3,%ymm3
1057         vmovdqa         %ymm2,0x140-0x100(%rcx)
1058         vmovdqa         %ymm3,0x160-0x100(%rcx)
1059
1060         vpshufd         $0x00,%ymm15,%ymm12     # "xc0"
1061         vpshufd         $0x55,%ymm15,%ymm13     # "xc1"
1062         vmovdqa         %ymm12,0x180-0x200(%rax)
1063         vpshufd         $0xaa,%ymm15,%ymm14     # "xc2"
1064         vmovdqa         %ymm13,0x1a0-0x200(%rax)
1065         vpshufd         $0xff,%ymm15,%ymm15     # "xc3"
1066         vmovdqa         %ymm14,0x1c0-0x200(%rax)
1067         vmovdqa         %ymm15,0x1e0-0x200(%rax)
1068
1069         vpshufd         $0x00,%ymm7,%ymm4
1070         vpshufd         $0x55,%ymm7,%ymm5
1071         vpaddd          .Lincy(%rip),%ymm4,%ymm4        # don't save counters yet
1072         vpshufd         $0xaa,%ymm7,%ymm6
1073         vmovdqa         %ymm5,0x220-0x200(%rax)
1074         vpshufd         $0xff,%ymm7,%ymm7
1075         vmovdqa         %ymm6,0x240-0x200(%rax)
1076         vmovdqa         %ymm7,0x260-0x200(%rax)
1077
1078         jmp             .Loop_enter8x
1079
1080 .align  32
1081 .Loop_outer8x:
1082         vmovdqa         0x80-0x100(%rcx),%ymm8  # re-load smashed key
1083         vmovdqa         0xa0-0x100(%rcx),%ymm9
1084         vmovdqa         0xc0-0x100(%rcx),%ymm10
1085         vmovdqa         0xe0-0x100(%rcx),%ymm11
1086         vmovdqa         0x100-0x100(%rcx),%ymm0
1087         vmovdqa         0x120-0x100(%rcx),%ymm1
1088         vmovdqa         0x140-0x100(%rcx),%ymm2
1089         vmovdqa         0x160-0x100(%rcx),%ymm3
1090         vmovdqa         0x180-0x200(%rax),%ymm12        # "xc0"
1091         vmovdqa         0x1a0-0x200(%rax),%ymm13        # "xc1"
1092         vmovdqa         0x1c0-0x200(%rax),%ymm14        # "xc2"
1093         vmovdqa         0x1e0-0x200(%rax),%ymm15        # "xc3"
1094         vmovdqa         0x200-0x200(%rax),%ymm4
1095         vmovdqa         0x220-0x200(%rax),%ymm5
1096         vmovdqa         0x240-0x200(%rax),%ymm6
1097         vmovdqa         0x260-0x200(%rax),%ymm7
1098         vpaddd          .Leight(%rip),%ymm4,%ymm4       # next SIMD counters
1099
1100 .Loop_enter8x:
1101         vmovdqa         %ymm14,0x40(%rsp)               # offload "xc2"
1102         vmovdqa         %ymm15,0x60(%rsp)               # offload "xc3"
1103         vbroadcasti128  (%r9),%ymm15
1104         vmovdqa         %ymm4,0x200-0x200(%rax) # save SIMD counters
1105         mov             $10,%eax
1106         jmp             .Loop8x
1107
1108 .align  32
1109 .Loop8x:
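        # 32 state words do not fit in 16 ymm registers, so half of the "xc"
        # row lives in the 0x00-0x60(%rsp) slots and is swapped in and out
        # between the two halves of each double round; %ymm14/%ymm15 also
        # double as scratch for the rotate masks broadcast from (%r9)/(%r11).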
1110         vpaddd  %ymm0,%ymm8,%ymm8
1111         vpxor   %ymm4,%ymm8,%ymm4
1112         vpshufb %ymm15,%ymm4,%ymm4
1113         vpaddd  %ymm1,%ymm9,%ymm9
1114         vpxor   %ymm5,%ymm9,%ymm5
1115         vpshufb %ymm15,%ymm5,%ymm5
1116         vpaddd  %ymm4,%ymm12,%ymm12
1117         vpxor   %ymm0,%ymm12,%ymm0
1118         vpslld  $12,%ymm0,%ymm14
1119         vpsrld  $20,%ymm0,%ymm0
1120         vpor    %ymm0,%ymm14,%ymm0
1121         vbroadcasti128  (%r11),%ymm14
1122         vpaddd  %ymm5,%ymm13,%ymm13
1123         vpxor   %ymm1,%ymm13,%ymm1
1124         vpslld  $12,%ymm1,%ymm15
1125         vpsrld  $20,%ymm1,%ymm1
1126         vpor    %ymm1,%ymm15,%ymm1
1127         vpaddd  %ymm0,%ymm8,%ymm8
1128         vpxor   %ymm4,%ymm8,%ymm4
1129         vpshufb %ymm14,%ymm4,%ymm4
1130         vpaddd  %ymm1,%ymm9,%ymm9
1131         vpxor   %ymm5,%ymm9,%ymm5
1132         vpshufb %ymm14,%ymm5,%ymm5
1133         vpaddd  %ymm4,%ymm12,%ymm12
1134         vpxor   %ymm0,%ymm12,%ymm0
1135         vpslld  $7,%ymm0,%ymm15
1136         vpsrld  $25,%ymm0,%ymm0
1137         vpor    %ymm0,%ymm15,%ymm0
1138         vbroadcasti128  (%r9),%ymm15
1139         vpaddd  %ymm5,%ymm13,%ymm13
1140         vpxor   %ymm1,%ymm13,%ymm1
1141         vpslld  $7,%ymm1,%ymm14
1142         vpsrld  $25,%ymm1,%ymm1
1143         vpor    %ymm1,%ymm14,%ymm1
1144         vmovdqa %ymm12,0(%rsp)
1145         vmovdqa %ymm13,32(%rsp)
1146         vmovdqa 64(%rsp),%ymm12
1147         vmovdqa 96(%rsp),%ymm13
1148         vpaddd  %ymm2,%ymm10,%ymm10
1149         vpxor   %ymm6,%ymm10,%ymm6
1150         vpshufb %ymm15,%ymm6,%ymm6
1151         vpaddd  %ymm3,%ymm11,%ymm11
1152         vpxor   %ymm7,%ymm11,%ymm7
1153         vpshufb %ymm15,%ymm7,%ymm7
1154         vpaddd  %ymm6,%ymm12,%ymm12
1155         vpxor   %ymm2,%ymm12,%ymm2
1156         vpslld  $12,%ymm2,%ymm14
1157         vpsrld  $20,%ymm2,%ymm2
1158         vpor    %ymm2,%ymm14,%ymm2
1159         vbroadcasti128  (%r11),%ymm14
1160         vpaddd  %ymm7,%ymm13,%ymm13
1161         vpxor   %ymm3,%ymm13,%ymm3
1162         vpslld  $12,%ymm3,%ymm15
1163         vpsrld  $20,%ymm3,%ymm3
1164         vpor    %ymm3,%ymm15,%ymm3
1165         vpaddd  %ymm2,%ymm10,%ymm10
1166         vpxor   %ymm6,%ymm10,%ymm6
1167         vpshufb %ymm14,%ymm6,%ymm6
1168         vpaddd  %ymm3,%ymm11,%ymm11
1169         vpxor   %ymm7,%ymm11,%ymm7
1170         vpshufb %ymm14,%ymm7,%ymm7
1171         vpaddd  %ymm6,%ymm12,%ymm12
1172         vpxor   %ymm2,%ymm12,%ymm2
1173         vpslld  $7,%ymm2,%ymm15
1174         vpsrld  $25,%ymm2,%ymm2
1175         vpor    %ymm2,%ymm15,%ymm2
1176         vbroadcasti128  (%r9),%ymm15
1177         vpaddd  %ymm7,%ymm13,%ymm13
1178         vpxor   %ymm3,%ymm13,%ymm3
1179         vpslld  $7,%ymm3,%ymm14
1180         vpsrld  $25,%ymm3,%ymm3
1181         vpor    %ymm3,%ymm14,%ymm3
1182         vpaddd  %ymm1,%ymm8,%ymm8
1183         vpxor   %ymm7,%ymm8,%ymm7
1184         vpshufb %ymm15,%ymm7,%ymm7
1185         vpaddd  %ymm2,%ymm9,%ymm9
1186         vpxor   %ymm4,%ymm9,%ymm4
1187         vpshufb %ymm15,%ymm4,%ymm4
1188         vpaddd  %ymm7,%ymm12,%ymm12
1189         vpxor   %ymm1,%ymm12,%ymm1
1190         vpslld  $12,%ymm1,%ymm14
1191         vpsrld  $20,%ymm1,%ymm1
1192         vpor    %ymm1,%ymm14,%ymm1
1193         vbroadcasti128  (%r11),%ymm14
1194         vpaddd  %ymm4,%ymm13,%ymm13
1195         vpxor   %ymm2,%ymm13,%ymm2
1196         vpslld  $12,%ymm2,%ymm15
1197         vpsrld  $20,%ymm2,%ymm2
1198         vpor    %ymm2,%ymm15,%ymm2
1199         vpaddd  %ymm1,%ymm8,%ymm8
1200         vpxor   %ymm7,%ymm8,%ymm7
1201         vpshufb %ymm14,%ymm7,%ymm7
1202         vpaddd  %ymm2,%ymm9,%ymm9
1203         vpxor   %ymm4,%ymm9,%ymm4
1204         vpshufb %ymm14,%ymm4,%ymm4
1205         vpaddd  %ymm7,%ymm12,%ymm12
1206         vpxor   %ymm1,%ymm12,%ymm1
1207         vpslld  $7,%ymm1,%ymm15
1208         vpsrld  $25,%ymm1,%ymm1
1209         vpor    %ymm1,%ymm15,%ymm1
1210         vbroadcasti128  (%r9),%ymm15
1211         vpaddd  %ymm4,%ymm13,%ymm13
1212         vpxor   %ymm2,%ymm13,%ymm2
1213         vpslld  $7,%ymm2,%ymm14
1214         vpsrld  $25,%ymm2,%ymm2
1215         vpor    %ymm2,%ymm14,%ymm2
1216         vmovdqa %ymm12,64(%rsp)
1217         vmovdqa %ymm13,96(%rsp)
1218         vmovdqa 0(%rsp),%ymm12
1219         vmovdqa 32(%rsp),%ymm13
1220         vpaddd  %ymm3,%ymm10,%ymm10
1221         vpxor   %ymm5,%ymm10,%ymm5
1222         vpshufb %ymm15,%ymm5,%ymm5
1223         vpaddd  %ymm0,%ymm11,%ymm11
1224         vpxor   %ymm6,%ymm11,%ymm6
1225         vpshufb %ymm15,%ymm6,%ymm6
1226         vpaddd  %ymm5,%ymm12,%ymm12
1227         vpxor   %ymm3,%ymm12,%ymm3
1228         vpslld  $12,%ymm3,%ymm14
1229         vpsrld  $20,%ymm3,%ymm3
1230         vpor    %ymm3,%ymm14,%ymm3
1231         vbroadcasti128  (%r11),%ymm14
1232         vpaddd  %ymm6,%ymm13,%ymm13
1233         vpxor   %ymm0,%ymm13,%ymm0
1234         vpslld  $12,%ymm0,%ymm15
1235         vpsrld  $20,%ymm0,%ymm0
1236         vpor    %ymm0,%ymm15,%ymm0
1237         vpaddd  %ymm3,%ymm10,%ymm10
1238         vpxor   %ymm5,%ymm10,%ymm5
1239         vpshufb %ymm14,%ymm5,%ymm5
1240         vpaddd  %ymm0,%ymm11,%ymm11
1241         vpxor   %ymm6,%ymm11,%ymm6
1242         vpshufb %ymm14,%ymm6,%ymm6
1243         vpaddd  %ymm5,%ymm12,%ymm12
1244         vpxor   %ymm3,%ymm12,%ymm3
1245         vpslld  $7,%ymm3,%ymm15
1246         vpsrld  $25,%ymm3,%ymm3
1247         vpor    %ymm3,%ymm15,%ymm3
1248         vbroadcasti128  (%r9),%ymm15
1249         vpaddd  %ymm6,%ymm13,%ymm13
1250         vpxor   %ymm0,%ymm13,%ymm0
1251         vpslld  $7,%ymm0,%ymm14
1252         vpsrld  $25,%ymm0,%ymm0
1253         vpor    %ymm0,%ymm14,%ymm0
1254         dec             %eax
1255         jnz             .Loop8x
1256
1257         lea             0x200(%rsp),%rax        # size optimization
1258         vpaddd          0x80-0x100(%rcx),%ymm8,%ymm8    # accumulate key
1259         vpaddd          0xa0-0x100(%rcx),%ymm9,%ymm9
1260         vpaddd          0xc0-0x100(%rcx),%ymm10,%ymm10
1261         vpaddd          0xe0-0x100(%rcx),%ymm11,%ymm11
1262
1263         vpunpckldq      %ymm9,%ymm8,%ymm14              # "de-interlace" data
1264         vpunpckldq      %ymm11,%ymm10,%ymm15
1265         vpunpckhdq      %ymm9,%ymm8,%ymm8
1266         vpunpckhdq      %ymm11,%ymm10,%ymm10
1267         vpunpcklqdq     %ymm15,%ymm14,%ymm9             # "a0"
1268         vpunpckhqdq     %ymm15,%ymm14,%ymm14            # "a1"
1269         vpunpcklqdq     %ymm10,%ymm8,%ymm11             # "a2"
1270         vpunpckhqdq     %ymm10,%ymm8,%ymm8              # "a3"
1271         vpaddd          0x100-0x100(%rcx),%ymm0,%ymm0
1272         vpaddd          0x120-0x100(%rcx),%ymm1,%ymm1
1273         vpaddd          0x140-0x100(%rcx),%ymm2,%ymm2
1274         vpaddd          0x160-0x100(%rcx),%ymm3,%ymm3
1275
1276         vpunpckldq      %ymm1,%ymm0,%ymm10
1277         vpunpckldq      %ymm3,%ymm2,%ymm15
1278         vpunpckhdq      %ymm1,%ymm0,%ymm0
1279         vpunpckhdq      %ymm3,%ymm2,%ymm2
1280         vpunpcklqdq     %ymm15,%ymm10,%ymm1             # "b0"
1281         vpunpckhqdq     %ymm15,%ymm10,%ymm10            # "b1"
1282         vpunpcklqdq     %ymm2,%ymm0,%ymm3               # "b2"
1283         vpunpckhqdq     %ymm2,%ymm0,%ymm0               # "b3"
1284         vperm2i128      $0x20,%ymm1,%ymm9,%ymm15        # "de-interlace" further
1285         vperm2i128      $0x31,%ymm1,%ymm9,%ymm1
1286         vperm2i128      $0x20,%ymm10,%ymm14,%ymm9
1287         vperm2i128      $0x31,%ymm10,%ymm14,%ymm10
1288         vperm2i128      $0x20,%ymm3,%ymm11,%ymm14
1289         vperm2i128      $0x31,%ymm3,%ymm11,%ymm3
1290         vperm2i128      $0x20,%ymm0,%ymm8,%ymm11
1291         vperm2i128      $0x31,%ymm0,%ymm8,%ymm0
1292         vmovdqa         %ymm15,0x00(%rsp)               # offload 
1293         vmovdqa         %ymm9,0x20(%rsp)
1294         vmovdqa         0x40(%rsp),%ymm15               # "xc2"
1295         vmovdqa         0x60(%rsp),%ymm9                # "xc3"
1296
1297         vpaddd          0x180-0x200(%rax),%ymm12,%ymm12
1298         vpaddd          0x1a0-0x200(%rax),%ymm13,%ymm13
1299         vpaddd          0x1c0-0x200(%rax),%ymm15,%ymm15
1300         vpaddd          0x1e0-0x200(%rax),%ymm9,%ymm9
1301
1302         vpunpckldq      %ymm13,%ymm12,%ymm2
1303         vpunpckldq      %ymm9,%ymm15,%ymm8
1304         vpunpckhdq      %ymm13,%ymm12,%ymm12
1305         vpunpckhdq      %ymm9,%ymm15,%ymm15
1306         vpunpcklqdq     %ymm8,%ymm2,%ymm13              # "c0"
1307         vpunpckhqdq     %ymm8,%ymm2,%ymm2               # "c1"
1308         vpunpcklqdq     %ymm15,%ymm12,%ymm9             # "c2"
1309         vpunpckhqdq     %ymm15,%ymm12,%ymm12            # "c3"
1310         vpaddd          0x200-0x200(%rax),%ymm4,%ymm4
1311         vpaddd          0x220-0x200(%rax),%ymm5,%ymm5
1312         vpaddd          0x240-0x200(%rax),%ymm6,%ymm6
1313         vpaddd          0x260-0x200(%rax),%ymm7,%ymm7
1314
1315         vpunpckldq      %ymm5,%ymm4,%ymm15
1316         vpunpckldq      %ymm7,%ymm6,%ymm8
1317         vpunpckhdq      %ymm5,%ymm4,%ymm4
1318         vpunpckhdq      %ymm7,%ymm6,%ymm6
1319         vpunpcklqdq     %ymm8,%ymm15,%ymm5              # "d0"
1320         vpunpckhqdq     %ymm8,%ymm15,%ymm15             # "d1"
1321         vpunpcklqdq     %ymm6,%ymm4,%ymm7               # "d2"
1322         vpunpckhqdq     %ymm6,%ymm4,%ymm4               # "d3"
1323         vperm2i128      $0x20,%ymm5,%ymm13,%ymm8        # "de-interlace" further
1324         vperm2i128      $0x31,%ymm5,%ymm13,%ymm5
1325         vperm2i128      $0x20,%ymm15,%ymm2,%ymm13
1326         vperm2i128      $0x31,%ymm15,%ymm2,%ymm15
1327         vperm2i128      $0x20,%ymm7,%ymm9,%ymm2
1328         vperm2i128      $0x31,%ymm7,%ymm9,%ymm7
1329         vperm2i128      $0x20,%ymm4,%ymm12,%ymm9
1330         vperm2i128      $0x31,%ymm4,%ymm12,%ymm4
1331         vmovdqa         0x00(%rsp),%ymm6                #  was offloaded, remember?
1332         vmovdqa         0x20(%rsp),%ymm12
1333
1334         cmp             $64*8,%rdx
1335         jb              .Ltail8x
1336
1337         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1338         vpxor           0x20(%rsi),%ymm8,%ymm8
1339         vpxor           0x40(%rsi),%ymm1,%ymm1
1340         vpxor           0x60(%rsi),%ymm5,%ymm5
1341         lea             0x80(%rsi),%rsi         # size optimization
1342         vmovdqu         %ymm6,0x00(%rdi)
1343         vmovdqu         %ymm8,0x20(%rdi)
1344         vmovdqu         %ymm1,0x40(%rdi)
1345         vmovdqu         %ymm5,0x60(%rdi)
1346         lea             0x80(%rdi),%rdi         # size optimization
1347
1348         vpxor           0x00(%rsi),%ymm12,%ymm12
1349         vpxor           0x20(%rsi),%ymm13,%ymm13
1350         vpxor           0x40(%rsi),%ymm10,%ymm10
1351         vpxor           0x60(%rsi),%ymm15,%ymm15
1352         lea             0x80(%rsi),%rsi         # size optimization
1353         vmovdqu         %ymm12,0x00(%rdi)
1354         vmovdqu         %ymm13,0x20(%rdi)
1355         vmovdqu         %ymm10,0x40(%rdi)
1356         vmovdqu         %ymm15,0x60(%rdi)
1357         lea             0x80(%rdi),%rdi         # size optimization
1358
1359         vpxor           0x00(%rsi),%ymm14,%ymm14
1360         vpxor           0x20(%rsi),%ymm2,%ymm2
1361         vpxor           0x40(%rsi),%ymm3,%ymm3
1362         vpxor           0x60(%rsi),%ymm7,%ymm7
1363         lea             0x80(%rsi),%rsi         # size optimization
1364         vmovdqu         %ymm14,0x00(%rdi)
1365         vmovdqu         %ymm2,0x20(%rdi)
1366         vmovdqu         %ymm3,0x40(%rdi)
1367         vmovdqu         %ymm7,0x60(%rdi)
1368         lea             0x80(%rdi),%rdi         # size optimization
1369
1370         vpxor           0x00(%rsi),%ymm11,%ymm11
1371         vpxor           0x20(%rsi),%ymm9,%ymm9
1372         vpxor           0x40(%rsi),%ymm0,%ymm0
1373         vpxor           0x60(%rsi),%ymm4,%ymm4
1374         lea             0x80(%rsi),%rsi         # size optimization
1375         vmovdqu         %ymm11,0x00(%rdi)
1376         vmovdqu         %ymm9,0x20(%rdi)
1377         vmovdqu         %ymm0,0x40(%rdi)
1378         vmovdqu         %ymm4,0x60(%rdi)
1379         lea             0x80(%rdi),%rdi         # size optimization
1380
1381         sub             $64*8,%rdx
1382         jnz             .Loop_outer8x
1383
1384         jmp             .Ldone8x
1385
1386 .Ltail8x:
1387         cmp             $448,%rdx
1388         jae             .L448_or_more8x
1389         cmp             $384,%rdx
1390         jae             .L384_or_more8x
1391         cmp             $320,%rdx
1392         jae             .L320_or_more8x
1393         cmp             $256,%rdx
1394         jae             .L256_or_more8x
1395         cmp             $192,%rdx
1396         jae             .L192_or_more8x
1397         cmp             $128,%rdx
1398         jae             .L128_or_more8x
1399         cmp             $64,%rdx
1400         jae             .L64_or_more8x
1401
1402         xor             %r9,%r9
1403         vmovdqa         %ymm6,0x00(%rsp)
1404         vmovdqa         %ymm8,0x20(%rsp)
1405         jmp             .Loop_tail8x
1406
1407 .align  32
1408 .L64_or_more8x:
1409         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1410         vpxor           0x20(%rsi),%ymm8,%ymm8
1411         vmovdqu         %ymm6,0x00(%rdi)
1412         vmovdqu         %ymm8,0x20(%rdi)
1413         je              .Ldone8x
1414
1415         lea             0x40(%rsi),%rsi         # inp+=64*1
1416         xor             %r9,%r9
1417         vmovdqa         %ymm1,0x00(%rsp)
1418         lea             0x40(%rdi),%rdi         # out+=64*1
1419         sub             $64,%rdx                # len-=64*1
1420         vmovdqa         %ymm5,0x20(%rsp)
1421         jmp             .Loop_tail8x
1422
1423 .align  32
1424 .L128_or_more8x:
1425         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1426         vpxor           0x20(%rsi),%ymm8,%ymm8
1427         vpxor           0x40(%rsi),%ymm1,%ymm1
1428         vpxor           0x60(%rsi),%ymm5,%ymm5
1429         vmovdqu         %ymm6,0x00(%rdi)
1430         vmovdqu         %ymm8,0x20(%rdi)
1431         vmovdqu         %ymm1,0x40(%rdi)
1432         vmovdqu         %ymm5,0x60(%rdi)
1433         je              .Ldone8x
1434
1435         lea             0x80(%rsi),%rsi         # inp+=64*2
1436         xor             %r9,%r9
1437         vmovdqa         %ymm12,0x00(%rsp)
1438         lea             0x80(%rdi),%rdi         # out+=64*2
1439         sub             $128,%rdx               # len-=64*2
1440         vmovdqa         %ymm13,0x20(%rsp)
1441         jmp             .Loop_tail8x
1442
1443 .align  32
1444 .L192_or_more8x:
1445         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1446         vpxor           0x20(%rsi),%ymm8,%ymm8
1447         vpxor           0x40(%rsi),%ymm1,%ymm1
1448         vpxor           0x60(%rsi),%ymm5,%ymm5
1449         vpxor           0x80(%rsi),%ymm12,%ymm12
1450         vpxor           0xa0(%rsi),%ymm13,%ymm13
1451         vmovdqu         %ymm6,0x00(%rdi)
1452         vmovdqu         %ymm8,0x20(%rdi)
1453         vmovdqu         %ymm1,0x40(%rdi)
1454         vmovdqu         %ymm5,0x60(%rdi)
1455         vmovdqu         %ymm12,0x80(%rdi)
1456         vmovdqu         %ymm13,0xa0(%rdi)
1457         je              .Ldone8x
1458
1459         lea             0xc0(%rsi),%rsi         # inp+=64*3
1460         xor             %r9,%r9
1461         vmovdqa         %ymm10,0x00(%rsp)
1462         lea             0xc0(%rdi),%rdi         # out+=64*3
1463         sub             $192,%rdx               # len-=64*3
1464         vmovdqa         %ymm15,0x20(%rsp)
1465         jmp             .Loop_tail8x
1466
1467 .align  32
1468 .L256_or_more8x:
1469         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1470         vpxor           0x20(%rsi),%ymm8,%ymm8
1471         vpxor           0x40(%rsi),%ymm1,%ymm1
1472         vpxor           0x60(%rsi),%ymm5,%ymm5
1473         vpxor           0x80(%rsi),%ymm12,%ymm12
1474         vpxor           0xa0(%rsi),%ymm13,%ymm13
1475         vpxor           0xc0(%rsi),%ymm10,%ymm10
1476         vpxor           0xe0(%rsi),%ymm15,%ymm15
1477         vmovdqu         %ymm6,0x00(%rdi)
1478         vmovdqu         %ymm8,0x20(%rdi)
1479         vmovdqu         %ymm1,0x40(%rdi)
1480         vmovdqu         %ymm5,0x60(%rdi)
1481         vmovdqu         %ymm12,0x80(%rdi)
1482         vmovdqu         %ymm13,0xa0(%rdi)
1483         vmovdqu         %ymm10,0xc0(%rdi)
1484         vmovdqu         %ymm15,0xe0(%rdi)
1485         je              .Ldone8x
1486
1487         lea             0x100(%rsi),%rsi        # inp+=64*4
1488         xor             %r9,%r9
1489         vmovdqa         %ymm14,0x00(%rsp)
1490         lea             0x100(%rdi),%rdi        # out+=64*4
1491         sub             $256,%rdx               # len-=64*4
1492         vmovdqa         %ymm2,0x20(%rsp)
1493         jmp             .Loop_tail8x
1494
1495 .align  32
1496 .L320_or_more8x:
1497         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1498         vpxor           0x20(%rsi),%ymm8,%ymm8
1499         vpxor           0x40(%rsi),%ymm1,%ymm1
1500         vpxor           0x60(%rsi),%ymm5,%ymm5
1501         vpxor           0x80(%rsi),%ymm12,%ymm12
1502         vpxor           0xa0(%rsi),%ymm13,%ymm13
1503         vpxor           0xc0(%rsi),%ymm10,%ymm10
1504         vpxor           0xe0(%rsi),%ymm15,%ymm15
1505         vpxor           0x100(%rsi),%ymm14,%ymm14
1506         vpxor           0x120(%rsi),%ymm2,%ymm2
1507         vmovdqu         %ymm6,0x00(%rdi)
1508         vmovdqu         %ymm8,0x20(%rdi)
1509         vmovdqu         %ymm1,0x40(%rdi)
1510         vmovdqu         %ymm5,0x60(%rdi)
1511         vmovdqu         %ymm12,0x80(%rdi)
1512         vmovdqu         %ymm13,0xa0(%rdi)
1513         vmovdqu         %ymm10,0xc0(%rdi)
1514         vmovdqu         %ymm15,0xe0(%rdi)
1515         vmovdqu         %ymm14,0x100(%rdi)
1516         vmovdqu         %ymm2,0x120(%rdi)
1517         je              .Ldone8x
1518
1519         lea             0x140(%rsi),%rsi        # inp+=64*5
1520         xor             %r9,%r9
1521         vmovdqa         %ymm3,0x00(%rsp)
1522         lea             0x140(%rdi),%rdi        # out+=64*5
1523         sub             $320,%rdx               # len-=64*5
1524         vmovdqa         %ymm7,0x20(%rsp)
1525         jmp             .Loop_tail8x
1526
1527 .align  32
1528 .L384_or_more8x:
1529         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1530         vpxor           0x20(%rsi),%ymm8,%ymm8
1531         vpxor           0x40(%rsi),%ymm1,%ymm1
1532         vpxor           0x60(%rsi),%ymm5,%ymm5
1533         vpxor           0x80(%rsi),%ymm12,%ymm12
1534         vpxor           0xa0(%rsi),%ymm13,%ymm13
1535         vpxor           0xc0(%rsi),%ymm10,%ymm10
1536         vpxor           0xe0(%rsi),%ymm15,%ymm15
1537         vpxor           0x100(%rsi),%ymm14,%ymm14
1538         vpxor           0x120(%rsi),%ymm2,%ymm2
1539         vpxor           0x140(%rsi),%ymm3,%ymm3
1540         vpxor           0x160(%rsi),%ymm7,%ymm7
1541         vmovdqu         %ymm6,0x00(%rdi)
1542         vmovdqu         %ymm8,0x20(%rdi)
1543         vmovdqu         %ymm1,0x40(%rdi)
1544         vmovdqu         %ymm5,0x60(%rdi)
1545         vmovdqu         %ymm12,0x80(%rdi)
1546         vmovdqu         %ymm13,0xa0(%rdi)
1547         vmovdqu         %ymm10,0xc0(%rdi)
1548         vmovdqu         %ymm15,0xe0(%rdi)
1549         vmovdqu         %ymm14,0x100(%rdi)
1550         vmovdqu         %ymm2,0x120(%rdi)
1551         vmovdqu         %ymm3,0x140(%rdi)
1552         vmovdqu         %ymm7,0x160(%rdi)
1553         je              .Ldone8x
1554
1555         lea             0x180(%rsi),%rsi        # inp+=64*6
1556         xor             %r9,%r9
1557         vmovdqa         %ymm11,0x00(%rsp)
1558         lea             0x180(%rdi),%rdi        # out+=64*6
1559         sub             $384,%rdx               # len-=64*6
1560         vmovdqa         %ymm9,0x20(%rsp)
1561         jmp             .Loop_tail8x
1562
1563 .align  32
1564 .L448_or_more8x:
1565         vpxor           0x00(%rsi),%ymm6,%ymm6  # xor with input
1566         vpxor           0x20(%rsi),%ymm8,%ymm8
1567         vpxor           0x40(%rsi),%ymm1,%ymm1
1568         vpxor           0x60(%rsi),%ymm5,%ymm5
1569         vpxor           0x80(%rsi),%ymm12,%ymm12
1570         vpxor           0xa0(%rsi),%ymm13,%ymm13
1571         vpxor           0xc0(%rsi),%ymm10,%ymm10
1572         vpxor           0xe0(%rsi),%ymm15,%ymm15
1573         vpxor           0x100(%rsi),%ymm14,%ymm14
1574         vpxor           0x120(%rsi),%ymm2,%ymm2
1575         vpxor           0x140(%rsi),%ymm3,%ymm3
1576         vpxor           0x160(%rsi),%ymm7,%ymm7
1577         vpxor           0x180(%rsi),%ymm11,%ymm11
1578         vpxor           0x1a0(%rsi),%ymm9,%ymm9
1579         vmovdqu         %ymm6,0x00(%rdi)
1580         vmovdqu         %ymm8,0x20(%rdi)
1581         vmovdqu         %ymm1,0x40(%rdi)
1582         vmovdqu         %ymm5,0x60(%rdi)
1583         vmovdqu         %ymm12,0x80(%rdi)
1584         vmovdqu         %ymm13,0xa0(%rdi)
1585         vmovdqu         %ymm10,0xc0(%rdi)
1586         vmovdqu         %ymm15,0xe0(%rdi)
1587         vmovdqu         %ymm14,0x100(%rdi)
1588         vmovdqu         %ymm2,0x120(%rdi)
1589         vmovdqu         %ymm3,0x140(%rdi)
1590         vmovdqu         %ymm7,0x160(%rdi)
1591         vmovdqu         %ymm11,0x180(%rdi)
1592         vmovdqu         %ymm9,0x1a0(%rdi)
1593         je              .Ldone8x
1594
1595         lea             0x1c0(%rsi),%rsi        # inp+=64*7
1596         xor             %r9,%r9
1597         vmovdqa         %ymm0,0x00(%rsp)
1598         lea             0x1c0(%rdi),%rdi        # out+=64*7
1599         sub             $448,%rdx               # len-=64*7
1600         vmovdqa         %ymm4,0x20(%rsp)
1601
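# Byte-granular finish: the leftover keystream block was staged at (%rsp)
# above; %r9 walks the remaining %rdx bytes, XORing input with keystream.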
1602 .Loop_tail8x:
1603         movzb           (%rsi,%r9),%eax
1604         movzb           (%rsp,%r9),%ecx
1605         lea             1(%r9),%r9
1606         xor             %ecx,%eax
1607         mov             %al,-1(%rdi,%r9)
1608         dec             %rdx
1609         jnz             .Loop_tail8x
1610
1611 .Ldone8x:
1612         vzeroall
1613         lea             -8(%r10),%rsp
1614 .L8x_epilogue:
1615         ret
1616 SYM_FUNC_END(chacha20_avx2)
1617 #endif
1618 #ifdef CONFIG_AS_AVX512
1619 .align 32
1620 SYM_FUNC_START(chacha20_avx512)
1621 .Lchacha20_avx512:
1622         lea     8(%rsp),%r10            # frame pointer
1623         cmp     $512,%rdx
1624         ja      .Lchacha20_16x
1625
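# Short AVX512F path (<=512 bytes): each zmm register holds four copies of one
# 16-byte state row, so its four 128-bit lanes carry four independent blocks.
# The vpaddd with .Lzeroz staggers the per-lane block counters and .Lfourz
# (kept in %zmm20) advances them by four blocks per outer iteration.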
1626         sub     $64+8,%rsp
1627         and     $-64,%rsp
1628         vbroadcasti32x4 .Lsigma(%rip),%zmm0
1629         vbroadcasti32x4 (%rcx),%zmm1
1630         vbroadcasti32x4 16(%rcx),%zmm2
1631         vbroadcasti32x4 (%r8),%zmm3
1632
1633         vmovdqa32       %zmm0,%zmm16
1634         vmovdqa32       %zmm1,%zmm17
1635         vmovdqa32       %zmm2,%zmm18
1636         vpaddd          .Lzeroz(%rip),%zmm3,%zmm3
1637         vmovdqa32       .Lfourz(%rip),%zmm20
1638         mov             $10,%r8 # reuse %r8
1639         vmovdqa32       %zmm3,%zmm19
1640         jmp             .Loop_avx512
1641
1642 .align  16
1643 .Loop_outer_avx512:
1644         vmovdqa32       %zmm16,%zmm0
1645         vmovdqa32       %zmm17,%zmm1
1646         vmovdqa32       %zmm18,%zmm2
1647         vpaddd          %zmm20,%zmm19,%zmm3
1648         mov             $10,%r8
1649         vmovdqa32       %zmm3,%zmm19
1650         jmp             .Loop_avx512
1651
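# Each pass through .Loop_avx512 is one double round: vprold 16/12/8/7 plus
# the vpshufd lane rotations give a column round followed by a diagonal round
# on all four blocks at once; ten passes make the usual 20 rounds.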
1652 .align  32
1653 .Loop_avx512:
1654         vpaddd  %zmm1,%zmm0,%zmm0
1655         vpxord  %zmm0,%zmm3,%zmm3
1656         vprold  $16,%zmm3,%zmm3
1657         vpaddd  %zmm3,%zmm2,%zmm2
1658         vpxord  %zmm2,%zmm1,%zmm1
1659         vprold  $12,%zmm1,%zmm1
1660         vpaddd  %zmm1,%zmm0,%zmm0
1661         vpxord  %zmm0,%zmm3,%zmm3
1662         vprold  $8,%zmm3,%zmm3
1663         vpaddd  %zmm3,%zmm2,%zmm2
1664         vpxord  %zmm2,%zmm1,%zmm1
1665         vprold  $7,%zmm1,%zmm1
1666         vpshufd $78,%zmm2,%zmm2
1667         vpshufd $57,%zmm1,%zmm1
1668         vpshufd $147,%zmm3,%zmm3
1669         vpaddd  %zmm1,%zmm0,%zmm0
1670         vpxord  %zmm0,%zmm3,%zmm3
1671         vprold  $16,%zmm3,%zmm3
1672         vpaddd  %zmm3,%zmm2,%zmm2
1673         vpxord  %zmm2,%zmm1,%zmm1
1674         vprold  $12,%zmm1,%zmm1
1675         vpaddd  %zmm1,%zmm0,%zmm0
1676         vpxord  %zmm0,%zmm3,%zmm3
1677         vprold  $8,%zmm3,%zmm3
1678         vpaddd  %zmm3,%zmm2,%zmm2
1679         vpxord  %zmm2,%zmm1,%zmm1
1680         vprold  $7,%zmm1,%zmm1
1681         vpshufd $78,%zmm2,%zmm2
1682         vpshufd $147,%zmm1,%zmm1
1683         vpshufd $57,%zmm3,%zmm3
1684         dec     %r8
1685         jnz     .Loop_avx512
1686         vpaddd          %zmm16,%zmm0,%zmm0
1687         vpaddd          %zmm17,%zmm1,%zmm1
1688         vpaddd          %zmm18,%zmm2,%zmm2
1689         vpaddd          %zmm19,%zmm3,%zmm3
1690
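# Rounds done and the saved initial state (zmm16-19) added back in.  Output is
# produced one 128-bit lane (one 64-byte block) at a time, vextracti32x4
# pulling lanes 1-3 for as long as input remains.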
1691         sub             $64,%rdx
1692         jb              .Ltail64_avx512
1693
1694         vpxor           0x00(%rsi),%xmm0,%xmm4  # xor with input
1695         vpxor           0x10(%rsi),%xmm1,%xmm5
1696         vpxor           0x20(%rsi),%xmm2,%xmm6
1697         vpxor           0x30(%rsi),%xmm3,%xmm7
1698         lea             0x40(%rsi),%rsi         # inp+=64
1699
1700         vmovdqu         %xmm4,0x00(%rdi)                # write output
1701         vmovdqu         %xmm5,0x10(%rdi)
1702         vmovdqu         %xmm6,0x20(%rdi)
1703         vmovdqu         %xmm7,0x30(%rdi)
1704         lea             0x40(%rdi),%rdi         # out+=64
1705
1706         jz              .Ldone_avx512
1707
1708         vextracti32x4   $1,%zmm0,%xmm4
1709         vextracti32x4   $1,%zmm1,%xmm5
1710         vextracti32x4   $1,%zmm2,%xmm6
1711         vextracti32x4   $1,%zmm3,%xmm7
1712
1713         sub             $64,%rdx
1714         jb              .Ltail_avx512
1715
1716         vpxor           0x00(%rsi),%xmm4,%xmm4  # xor with input
1717         vpxor           0x10(%rsi),%xmm5,%xmm5
1718         vpxor           0x20(%rsi),%xmm6,%xmm6
1719         vpxor           0x30(%rsi),%xmm7,%xmm7
1720         lea             0x40(%rsi),%rsi         # inp+=64
1721
1722         vmovdqu         %xmm4,0x00(%rdi)                # write output
1723         vmovdqu         %xmm5,0x10(%rdi)
1724         vmovdqu         %xmm6,0x20(%rdi)
1725         vmovdqu         %xmm7,0x30(%rdi)
1726         lea             0x40(%rdi),%rdi         # out+=64
1727
1728         jz              .Ldone_avx512
1729
1730         vextracti32x4   $2,%zmm0,%xmm4
1731         vextracti32x4   $2,%zmm1,%xmm5
1732         vextracti32x4   $2,%zmm2,%xmm6
1733         vextracti32x4   $2,%zmm3,%xmm7
1734
1735         sub             $64,%rdx
1736         jb              .Ltail_avx512
1737
1738         vpxor           0x00(%rsi),%xmm4,%xmm4  # xor with input
1739         vpxor           0x10(%rsi),%xmm5,%xmm5
1740         vpxor           0x20(%rsi),%xmm6,%xmm6
1741         vpxor           0x30(%rsi),%xmm7,%xmm7
1742         lea             0x40(%rsi),%rsi         # inp+=64
1743
1744         vmovdqu         %xmm4,0x00(%rdi)                # write output
1745         vmovdqu         %xmm5,0x10(%rdi)
1746         vmovdqu         %xmm6,0x20(%rdi)
1747         vmovdqu         %xmm7,0x30(%rdi)
1748         lea             0x40(%rdi),%rdi         # out+=64
1749
1750         jz              .Ldone_avx512
1751
1752         vextracti32x4   $3,%zmm0,%xmm4
1753         vextracti32x4   $3,%zmm1,%xmm5
1754         vextracti32x4   $3,%zmm2,%xmm6
1755         vextracti32x4   $3,%zmm3,%xmm7
1756
1757         sub             $64,%rdx
1758         jb              .Ltail_avx512
1759
1760         vpxor           0x00(%rsi),%xmm4,%xmm4  # xor with input
1761         vpxor           0x10(%rsi),%xmm5,%xmm5
1762         vpxor           0x20(%rsi),%xmm6,%xmm6
1763         vpxor           0x30(%rsi),%xmm7,%xmm7
1764         lea             0x40(%rsi),%rsi         # inp+=64
1765
1766         vmovdqu         %xmm4,0x00(%rdi)                # write output
1767         vmovdqu         %xmm5,0x10(%rdi)
1768         vmovdqu         %xmm6,0x20(%rdi)
1769         vmovdqu         %xmm7,0x30(%rdi)
1770         lea             0x40(%rdi),%rdi         # out+=64
1771
1772         jnz             .Loop_outer_avx512
1773
1774         jmp             .Ldone_avx512
1775
1776 .align  16
1777 .Ltail64_avx512:
1778         vmovdqa         %xmm0,0x00(%rsp)
1779         vmovdqa         %xmm1,0x10(%rsp)
1780         vmovdqa         %xmm2,0x20(%rsp)
1781         vmovdqa         %xmm3,0x30(%rsp)
1782         add             $64,%rdx
1783         jmp             .Loop_tail_avx512
1784
1785 .align  16
1786 .Ltail_avx512:
1787         vmovdqa         %xmm4,0x00(%rsp)
1788         vmovdqa         %xmm5,0x10(%rsp)
1789         vmovdqa         %xmm6,0x20(%rsp)
1790         vmovdqa         %xmm7,0x30(%rsp)
1791         add             $64,%rdx
1792
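# Partial block: the unused keystream lane now sits at 0x00-0x30(%rsp).  XOR
# it into the last %rdx bytes, then overwrite the stack copy with the first
# state row (the public sigma constant held in zmm16).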
1793 .Loop_tail_avx512:
1794         movzb           (%rsi,%r8),%eax
1795         movzb           (%rsp,%r8),%ecx
1796         lea             1(%r8),%r8
1797         xor             %ecx,%eax
1798         mov             %al,-1(%rdi,%r8)
1799         dec             %rdx
1800         jnz             .Loop_tail_avx512
1801
1802         vmovdqu32       %zmm16,0x00(%rsp)
1803
1804 .Ldone_avx512:
1805         vzeroall
1806         lea     -8(%r10),%rsp
1807 .Lavx512_epilogue:
1808         ret
1809 SYM_FUNC_END(chacha20_avx512)
1810 .align 32
1811 SYM_FUNC_START(chacha20_avx512vl)
1812 .Lchacha20_avx512vl:
1813         lea     8(%rsp),%r10            # frame pointer
1814         cmp     $128,%rdx
1815         ja      .Lchacha20_8xvl
1816
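# Short AVX512VL path (<=128 bytes): same layout as the zmm path above but in
# ymm registers, i.e. two independent blocks per pass, with .Ltwoy advancing
# the two lane counters by two blocks per outer iteration.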
1817         sub     $64+8,%rsp
1818         and     $-32,%rsp
1819         vbroadcasti128  .Lsigma(%rip),%ymm0
1820         vbroadcasti128  (%rcx),%ymm1
1821         vbroadcasti128  16(%rcx),%ymm2
1822         vbroadcasti128  (%r8),%ymm3
1823
1824         vmovdqa32       %ymm0,%ymm16
1825         vmovdqa32       %ymm1,%ymm17
1826         vmovdqa32       %ymm2,%ymm18
1827         vpaddd          .Lzeroz(%rip),%ymm3,%ymm3
1828         vmovdqa32       .Ltwoy(%rip),%ymm20
1829         mov             $10,%r8 # reuse %r8
1830         vmovdqa32       %ymm3,%ymm19
1831         jmp             .Loop_avx512vl
1832
1833 .align  16
1834 .Loop_outer_avx512vl:
1835         vmovdqa32       %ymm18,%ymm2
1836         vpaddd          %ymm20,%ymm19,%ymm3
1837         mov             $10,%r8
1838         vmovdqa32       %ymm3,%ymm19
1839         jmp             .Loop_avx512vl
1840
1841 .align  32
1842 .Loop_avx512vl:
1843         vpaddd  %ymm1,%ymm0,%ymm0
1844         vpxor   %ymm0,%ymm3,%ymm3
1845         vprold  $16,%ymm3,%ymm3
1846         vpaddd  %ymm3,%ymm2,%ymm2
1847         vpxor   %ymm2,%ymm1,%ymm1
1848         vprold  $12,%ymm1,%ymm1
1849         vpaddd  %ymm1,%ymm0,%ymm0
1850         vpxor   %ymm0,%ymm3,%ymm3
1851         vprold  $8,%ymm3,%ymm3
1852         vpaddd  %ymm3,%ymm2,%ymm2
1853         vpxor   %ymm2,%ymm1,%ymm1
1854         vprold  $7,%ymm1,%ymm1
1855         vpshufd $78,%ymm2,%ymm2
1856         vpshufd $57,%ymm1,%ymm1
1857         vpshufd $147,%ymm3,%ymm3
1858         vpaddd  %ymm1,%ymm0,%ymm0
1859         vpxor   %ymm0,%ymm3,%ymm3
1860         vprold  $16,%ymm3,%ymm3
1861         vpaddd  %ymm3,%ymm2,%ymm2
1862         vpxor   %ymm2,%ymm1,%ymm1
1863         vprold  $12,%ymm1,%ymm1
1864         vpaddd  %ymm1,%ymm0,%ymm0
1865         vpxor   %ymm0,%ymm3,%ymm3
1866         vprold  $8,%ymm3,%ymm3
1867         vpaddd  %ymm3,%ymm2,%ymm2
1868         vpxor   %ymm2,%ymm1,%ymm1
1869         vprold  $7,%ymm1,%ymm1
1870         vpshufd $78,%ymm2,%ymm2
1871         vpshufd $147,%ymm1,%ymm1
1872         vpshufd $57,%ymm3,%ymm3
1873         dec     %r8
1874         jnz     .Loop_avx512vl
1875         vpaddd          %ymm16,%ymm0,%ymm0
1876         vpaddd          %ymm17,%ymm1,%ymm1
1877         vpaddd          %ymm18,%ymm2,%ymm2
1878         vpaddd          %ymm19,%ymm3,%ymm3
1879
1880         sub             $64,%rdx
1881         jb              .Ltail64_avx512vl
1882
1883         vpxor           0x00(%rsi),%xmm0,%xmm4  # xor with input
1884         vpxor           0x10(%rsi),%xmm1,%xmm5
1885         vpxor           0x20(%rsi),%xmm2,%xmm6
1886         vpxor           0x30(%rsi),%xmm3,%xmm7
1887         lea             0x40(%rsi),%rsi         # inp+=64
1888
1889         vmovdqu         %xmm4,0x00(%rdi)                # write output
1890         vmovdqu         %xmm5,0x10(%rdi)
1891         vmovdqu         %xmm6,0x20(%rdi)
1892         vmovdqu         %xmm7,0x30(%rdi)
1893         lea             0x40(%rdi),%rdi         # out+=64
1894
1895         jz              .Ldone_avx512vl
1896
1897         vextracti128    $1,%ymm0,%xmm4
1898         vextracti128    $1,%ymm1,%xmm5
1899         vextracti128    $1,%ymm2,%xmm6
1900         vextracti128    $1,%ymm3,%xmm7
1901
1902         sub             $64,%rdx
1903         jb              .Ltail_avx512vl
1904
1905         vpxor           0x00(%rsi),%xmm4,%xmm4  # xor with input
1906         vpxor           0x10(%rsi),%xmm5,%xmm5
1907         vpxor           0x20(%rsi),%xmm6,%xmm6
1908         vpxor           0x30(%rsi),%xmm7,%xmm7
1909         lea             0x40(%rsi),%rsi         # inp+=64
1910
1911         vmovdqu         %xmm4,0x00(%rdi)                # write output
1912         vmovdqu         %xmm5,0x10(%rdi)
1913         vmovdqu         %xmm6,0x20(%rdi)
1914         vmovdqu         %xmm7,0x30(%rdi)
1915         lea             0x40(%rdi),%rdi         # out+=64
1916
1917         vmovdqa32       %ymm16,%ymm0
1918         vmovdqa32       %ymm17,%ymm1
1919         jnz             .Loop_outer_avx512vl
1920
1921         jmp             .Ldone_avx512vl
1922
1923 .align  16
1924 .Ltail64_avx512vl:
1925         vmovdqa         %xmm0,0x00(%rsp)
1926         vmovdqa         %xmm1,0x10(%rsp)
1927         vmovdqa         %xmm2,0x20(%rsp)
1928         vmovdqa         %xmm3,0x30(%rsp)
1929         add             $64,%rdx
1930         jmp             .Loop_tail_avx512vl
1931
1932 .align  16
1933 .Ltail_avx512vl:
1934         vmovdqa         %xmm4,0x00(%rsp)
1935         vmovdqa         %xmm5,0x10(%rsp)
1936         vmovdqa         %xmm6,0x20(%rsp)
1937         vmovdqa         %xmm7,0x30(%rsp)
1938         add             $64,%rdx
1939
1940 .Loop_tail_avx512vl:
1941         movzb           (%rsi,%r8),%eax
1942         movzb           (%rsp,%r8),%ecx
1943         lea             1(%r8),%r8
1944         xor             %ecx,%eax
1945         mov             %al,-1(%rdi,%r8)
1946         dec             %rdx
1947         jnz             .Loop_tail_avx512vl
1948
1949         vmovdqu32       %ymm16,0x00(%rsp)
1950         vmovdqu32       %ymm16,0x20(%rsp)
1951
1952 .Ldone_avx512vl:
1953         vzeroall
1954         lea     -8(%r10),%rsp
1955 .Lavx512vl_epilogue:
1956         ret
1957 SYM_FUNC_END(chacha20_avx512vl)
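# chacha20_16x: AVX512F bulk path.  Each of the 16 state words is broadcast
# into its own zmm register ("smash key by lanes"), so the 16 dword lanes of
# every register belong to 16 independent blocks (1024 bytes per outer pass);
# after the rounds the data is transposed back into contiguous 64-byte blocks.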
1958 .type   chacha20_16x,@function
1959 .align  32
1960 chacha20_16x:
1961 .Lchacha20_16x:
1962         lea             8(%rsp),%r10            # frame register
1963         sub             $64+8,%rsp
1964         and             $-64,%rsp
1965         vzeroupper
1966
1967         lea             .Lsigma(%rip),%r9
1968         vbroadcasti32x4 (%r9),%zmm3             # key[0]
1969         vbroadcasti32x4 (%rcx),%zmm7            # key[1]
1970         vbroadcasti32x4 16(%rcx),%zmm11         # key[2]
1971         vbroadcasti32x4 (%r8),%zmm15            # key[3]
1972
1973         vpshufd         $0x00,%zmm3,%zmm0       # smash key by lanes...
1974         vpshufd         $0x55,%zmm3,%zmm1
1975         vpshufd         $0xaa,%zmm3,%zmm2
1976         vpshufd         $0xff,%zmm3,%zmm3
1977         vmovdqa64       %zmm0,%zmm16
1978         vmovdqa64       %zmm1,%zmm17
1979         vmovdqa64       %zmm2,%zmm18
1980         vmovdqa64       %zmm3,%zmm19
1981
1982         vpshufd         $0x00,%zmm7,%zmm4
1983         vpshufd         $0x55,%zmm7,%zmm5
1984         vpshufd         $0xaa,%zmm7,%zmm6
1985         vpshufd         $0xff,%zmm7,%zmm7
1986         vmovdqa64       %zmm4,%zmm20
1987         vmovdqa64       %zmm5,%zmm21
1988         vmovdqa64       %zmm6,%zmm22
1989         vmovdqa64       %zmm7,%zmm23
1990
1991         vpshufd         $0x00,%zmm11,%zmm8
1992         vpshufd         $0x55,%zmm11,%zmm9
1993         vpshufd         $0xaa,%zmm11,%zmm10
1994         vpshufd         $0xff,%zmm11,%zmm11
1995         vmovdqa64       %zmm8,%zmm24
1996         vmovdqa64       %zmm9,%zmm25
1997         vmovdqa64       %zmm10,%zmm26
1998         vmovdqa64       %zmm11,%zmm27
1999
2000         vpshufd         $0x00,%zmm15,%zmm12
2001         vpshufd         $0x55,%zmm15,%zmm13
2002         vpshufd         $0xaa,%zmm15,%zmm14
2003         vpshufd         $0xff,%zmm15,%zmm15
2004         vpaddd          .Lincz(%rip),%zmm12,%zmm12      # don't save counters yet
2005         vmovdqa64       %zmm12,%zmm28
2006         vmovdqa64       %zmm13,%zmm29
2007         vmovdqa64       %zmm14,%zmm30
2008         vmovdqa64       %zmm15,%zmm31
2009
2010         mov             $10,%eax
2011         jmp             .Loop16x
2012
2013 .align  32
2014 .Loop_outer16x:
2015         vpbroadcastd    0(%r9),%zmm0            # reload key
2016         vpbroadcastd    4(%r9),%zmm1
2017         vpbroadcastd    8(%r9),%zmm2
2018         vpbroadcastd    12(%r9),%zmm3
2019         vpaddd          .Lsixteen(%rip),%zmm28,%zmm28   # next SIMD counters
2020         vmovdqa64       %zmm20,%zmm4
2021         vmovdqa64       %zmm21,%zmm5
2022         vmovdqa64       %zmm22,%zmm6
2023         vmovdqa64       %zmm23,%zmm7
2024         vmovdqa64       %zmm24,%zmm8
2025         vmovdqa64       %zmm25,%zmm9
2026         vmovdqa64       %zmm26,%zmm10
2027         vmovdqa64       %zmm27,%zmm11
2028         vmovdqa64       %zmm28,%zmm12
2029         vmovdqa64       %zmm29,%zmm13
2030         vmovdqa64       %zmm30,%zmm14
2031         vmovdqa64       %zmm31,%zmm15
2032
2033         vmovdqa64       %zmm0,%zmm16
2034         vmovdqa64       %zmm1,%zmm17
2035         vmovdqa64       %zmm2,%zmm18
2036         vmovdqa64       %zmm3,%zmm19
2037
2038         mov             $10,%eax
2039         jmp             .Loop16x
2040
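# .Loop16x: zmm0-3/4-7/8-11/12-15 hold the a/b/c/d rows of the 16 blocks; each
# pass performs a column round and a diagonal round (note the rotated register
# assignments in the second half), ten passes for 20 rounds.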
2041 .align  32
2042 .Loop16x:
2043         vpaddd  %zmm4,%zmm0,%zmm0
2044         vpaddd  %zmm5,%zmm1,%zmm1
2045         vpaddd  %zmm6,%zmm2,%zmm2
2046         vpaddd  %zmm7,%zmm3,%zmm3
2047         vpxord  %zmm0,%zmm12,%zmm12
2048         vpxord  %zmm1,%zmm13,%zmm13
2049         vpxord  %zmm2,%zmm14,%zmm14
2050         vpxord  %zmm3,%zmm15,%zmm15
2051         vprold  $16,%zmm12,%zmm12
2052         vprold  $16,%zmm13,%zmm13
2053         vprold  $16,%zmm14,%zmm14
2054         vprold  $16,%zmm15,%zmm15
2055         vpaddd  %zmm12,%zmm8,%zmm8
2056         vpaddd  %zmm13,%zmm9,%zmm9
2057         vpaddd  %zmm14,%zmm10,%zmm10
2058         vpaddd  %zmm15,%zmm11,%zmm11
2059         vpxord  %zmm8,%zmm4,%zmm4
2060         vpxord  %zmm9,%zmm5,%zmm5
2061         vpxord  %zmm10,%zmm6,%zmm6
2062         vpxord  %zmm11,%zmm7,%zmm7
2063         vprold  $12,%zmm4,%zmm4
2064         vprold  $12,%zmm5,%zmm5
2065         vprold  $12,%zmm6,%zmm6
2066         vprold  $12,%zmm7,%zmm7
2067         vpaddd  %zmm4,%zmm0,%zmm0
2068         vpaddd  %zmm5,%zmm1,%zmm1
2069         vpaddd  %zmm6,%zmm2,%zmm2
2070         vpaddd  %zmm7,%zmm3,%zmm3
2071         vpxord  %zmm0,%zmm12,%zmm12
2072         vpxord  %zmm1,%zmm13,%zmm13
2073         vpxord  %zmm2,%zmm14,%zmm14
2074         vpxord  %zmm3,%zmm15,%zmm15
2075         vprold  $8,%zmm12,%zmm12
2076         vprold  $8,%zmm13,%zmm13
2077         vprold  $8,%zmm14,%zmm14
2078         vprold  $8,%zmm15,%zmm15
2079         vpaddd  %zmm12,%zmm8,%zmm8
2080         vpaddd  %zmm13,%zmm9,%zmm9
2081         vpaddd  %zmm14,%zmm10,%zmm10
2082         vpaddd  %zmm15,%zmm11,%zmm11
2083         vpxord  %zmm8,%zmm4,%zmm4
2084         vpxord  %zmm9,%zmm5,%zmm5
2085         vpxord  %zmm10,%zmm6,%zmm6
2086         vpxord  %zmm11,%zmm7,%zmm7
2087         vprold  $7,%zmm4,%zmm4
2088         vprold  $7,%zmm5,%zmm5
2089         vprold  $7,%zmm6,%zmm6
2090         vprold  $7,%zmm7,%zmm7
2091         vpaddd  %zmm5,%zmm0,%zmm0
2092         vpaddd  %zmm6,%zmm1,%zmm1
2093         vpaddd  %zmm7,%zmm2,%zmm2
2094         vpaddd  %zmm4,%zmm3,%zmm3
2095         vpxord  %zmm0,%zmm15,%zmm15
2096         vpxord  %zmm1,%zmm12,%zmm12
2097         vpxord  %zmm2,%zmm13,%zmm13
2098         vpxord  %zmm3,%zmm14,%zmm14
2099         vprold  $16,%zmm15,%zmm15
2100         vprold  $16,%zmm12,%zmm12
2101         vprold  $16,%zmm13,%zmm13
2102         vprold  $16,%zmm14,%zmm14
2103         vpaddd  %zmm15,%zmm10,%zmm10
2104         vpaddd  %zmm12,%zmm11,%zmm11
2105         vpaddd  %zmm13,%zmm8,%zmm8
2106         vpaddd  %zmm14,%zmm9,%zmm9
2107         vpxord  %zmm10,%zmm5,%zmm5
2108         vpxord  %zmm11,%zmm6,%zmm6
2109         vpxord  %zmm8,%zmm7,%zmm7
2110         vpxord  %zmm9,%zmm4,%zmm4
2111         vprold  $12,%zmm5,%zmm5
2112         vprold  $12,%zmm6,%zmm6
2113         vprold  $12,%zmm7,%zmm7
2114         vprold  $12,%zmm4,%zmm4
2115         vpaddd  %zmm5,%zmm0,%zmm0
2116         vpaddd  %zmm6,%zmm1,%zmm1
2117         vpaddd  %zmm7,%zmm2,%zmm2
2118         vpaddd  %zmm4,%zmm3,%zmm3
2119         vpxord  %zmm0,%zmm15,%zmm15
2120         vpxord  %zmm1,%zmm12,%zmm12
2121         vpxord  %zmm2,%zmm13,%zmm13
2122         vpxord  %zmm3,%zmm14,%zmm14
2123         vprold  $8,%zmm15,%zmm15
2124         vprold  $8,%zmm12,%zmm12
2125         vprold  $8,%zmm13,%zmm13
2126         vprold  $8,%zmm14,%zmm14
2127         vpaddd  %zmm15,%zmm10,%zmm10
2128         vpaddd  %zmm12,%zmm11,%zmm11
2129         vpaddd  %zmm13,%zmm8,%zmm8
2130         vpaddd  %zmm14,%zmm9,%zmm9
2131         vpxord  %zmm10,%zmm5,%zmm5
2132         vpxord  %zmm11,%zmm6,%zmm6
2133         vpxord  %zmm8,%zmm7,%zmm7
2134         vpxord  %zmm9,%zmm4,%zmm4
2135         vprold  $7,%zmm5,%zmm5
2136         vprold  $7,%zmm6,%zmm6
2137         vprold  $7,%zmm7,%zmm7
2138         vprold  $7,%zmm4,%zmm4
2139         dec             %eax
2140         jnz             .Loop16x
2141
2142         vpaddd          %zmm16,%zmm0,%zmm0      # accumulate key
2143         vpaddd          %zmm17,%zmm1,%zmm1
2144         vpaddd          %zmm18,%zmm2,%zmm2
2145         vpaddd          %zmm19,%zmm3,%zmm3
2146
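# Transpose back to output order: dword/qword unpacks regroup the per-word
# vectors into 128-bit block quarters, then vshufi32x4 gathers each block's
# quarters so the registers stored below hold whole 64-byte keystream blocks.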
2147         vpunpckldq      %zmm1,%zmm0,%zmm18              # "de-interlace" data
2148         vpunpckldq      %zmm3,%zmm2,%zmm19
2149         vpunpckhdq      %zmm1,%zmm0,%zmm0
2150         vpunpckhdq      %zmm3,%zmm2,%zmm2
2151         vpunpcklqdq     %zmm19,%zmm18,%zmm1             # "a0"
2152         vpunpckhqdq     %zmm19,%zmm18,%zmm18            # "a1"
2153         vpunpcklqdq     %zmm2,%zmm0,%zmm3               # "a2"
2154         vpunpckhqdq     %zmm2,%zmm0,%zmm0               # "a3"
2155         vpaddd          %zmm20,%zmm4,%zmm4
2156         vpaddd          %zmm21,%zmm5,%zmm5
2157         vpaddd          %zmm22,%zmm6,%zmm6
2158         vpaddd          %zmm23,%zmm7,%zmm7
2159
2160         vpunpckldq      %zmm5,%zmm4,%zmm2
2161         vpunpckldq      %zmm7,%zmm6,%zmm19
2162         vpunpckhdq      %zmm5,%zmm4,%zmm4
2163         vpunpckhdq      %zmm7,%zmm6,%zmm6
2164         vpunpcklqdq     %zmm19,%zmm2,%zmm5              # "b0"
2165         vpunpckhqdq     %zmm19,%zmm2,%zmm2              # "b1"
2166         vpunpcklqdq     %zmm6,%zmm4,%zmm7               # "b2"
2167         vpunpckhqdq     %zmm6,%zmm4,%zmm4               # "b3"
2168         vshufi32x4      $0x44,%zmm5,%zmm1,%zmm19        # "de-interlace" further
2169         vshufi32x4      $0xee,%zmm5,%zmm1,%zmm5
2170         vshufi32x4      $0x44,%zmm2,%zmm18,%zmm1
2171         vshufi32x4      $0xee,%zmm2,%zmm18,%zmm2
2172         vshufi32x4      $0x44,%zmm7,%zmm3,%zmm18
2173         vshufi32x4      $0xee,%zmm7,%zmm3,%zmm7
2174         vshufi32x4      $0x44,%zmm4,%zmm0,%zmm3
2175         vshufi32x4      $0xee,%zmm4,%zmm0,%zmm4
2176         vpaddd          %zmm24,%zmm8,%zmm8
2177         vpaddd          %zmm25,%zmm9,%zmm9
2178         vpaddd          %zmm26,%zmm10,%zmm10
2179         vpaddd          %zmm27,%zmm11,%zmm11
2180
2181         vpunpckldq      %zmm9,%zmm8,%zmm6
2182         vpunpckldq      %zmm11,%zmm10,%zmm0
2183         vpunpckhdq      %zmm9,%zmm8,%zmm8
2184         vpunpckhdq      %zmm11,%zmm10,%zmm10
2185         vpunpcklqdq     %zmm0,%zmm6,%zmm9               # "c0"
2186         vpunpckhqdq     %zmm0,%zmm6,%zmm6               # "c1"
2187         vpunpcklqdq     %zmm10,%zmm8,%zmm11             # "c2"
2188         vpunpckhqdq     %zmm10,%zmm8,%zmm8              # "c3"
2189         vpaddd          %zmm28,%zmm12,%zmm12
2190         vpaddd          %zmm29,%zmm13,%zmm13
2191         vpaddd          %zmm30,%zmm14,%zmm14
2192         vpaddd          %zmm31,%zmm15,%zmm15
2193
2194         vpunpckldq      %zmm13,%zmm12,%zmm10
2195         vpunpckldq      %zmm15,%zmm14,%zmm0
2196         vpunpckhdq      %zmm13,%zmm12,%zmm12
2197         vpunpckhdq      %zmm15,%zmm14,%zmm14
2198         vpunpcklqdq     %zmm0,%zmm10,%zmm13             # "d0"
2199         vpunpckhqdq     %zmm0,%zmm10,%zmm10             # "d1"
2200         vpunpcklqdq     %zmm14,%zmm12,%zmm15            # "d2"
2201         vpunpckhqdq     %zmm14,%zmm12,%zmm12            # "d3"
2202         vshufi32x4      $0x44,%zmm13,%zmm9,%zmm0        # "de-interlace" further
2203         vshufi32x4      $0xee,%zmm13,%zmm9,%zmm13
2204         vshufi32x4      $0x44,%zmm10,%zmm6,%zmm9
2205         vshufi32x4      $0xee,%zmm10,%zmm6,%zmm10
2206         vshufi32x4      $0x44,%zmm15,%zmm11,%zmm6
2207         vshufi32x4      $0xee,%zmm15,%zmm11,%zmm15
2208         vshufi32x4      $0x44,%zmm12,%zmm8,%zmm11
2209         vshufi32x4      $0xee,%zmm12,%zmm8,%zmm12
2210         vshufi32x4      $0x88,%zmm0,%zmm19,%zmm16       # "de-interlace" further
2211         vshufi32x4      $0xdd,%zmm0,%zmm19,%zmm19
2212          vshufi32x4     $0x88,%zmm13,%zmm5,%zmm0
2213          vshufi32x4     $0xdd,%zmm13,%zmm5,%zmm13
2214         vshufi32x4      $0x88,%zmm9,%zmm1,%zmm17
2215         vshufi32x4      $0xdd,%zmm9,%zmm1,%zmm1
2216          vshufi32x4     $0x88,%zmm10,%zmm2,%zmm9
2217          vshufi32x4     $0xdd,%zmm10,%zmm2,%zmm10
2218         vshufi32x4      $0x88,%zmm6,%zmm18,%zmm14
2219         vshufi32x4      $0xdd,%zmm6,%zmm18,%zmm18
2220          vshufi32x4     $0x88,%zmm15,%zmm7,%zmm6
2221          vshufi32x4     $0xdd,%zmm15,%zmm7,%zmm15
2222         vshufi32x4      $0x88,%zmm11,%zmm3,%zmm8
2223         vshufi32x4      $0xdd,%zmm11,%zmm3,%zmm3
2224          vshufi32x4     $0x88,%zmm12,%zmm4,%zmm11
2225          vshufi32x4     $0xdd,%zmm12,%zmm4,%zmm12
2226         cmp             $64*16,%rdx
2227         jb              .Ltail16x
2228
2229         vpxord          0x00(%rsi),%zmm16,%zmm16        # xor with input
2230         vpxord          0x40(%rsi),%zmm17,%zmm17
2231         vpxord          0x80(%rsi),%zmm14,%zmm14
2232         vpxord          0xc0(%rsi),%zmm8,%zmm8
2233         vmovdqu32       %zmm16,0x00(%rdi)
2234         vmovdqu32       %zmm17,0x40(%rdi)
2235         vmovdqu32       %zmm14,0x80(%rdi)
2236         vmovdqu32       %zmm8,0xc0(%rdi)
2237
2238         vpxord          0x100(%rsi),%zmm19,%zmm19
2239         vpxord          0x140(%rsi),%zmm1,%zmm1
2240         vpxord          0x180(%rsi),%zmm18,%zmm18
2241         vpxord          0x1c0(%rsi),%zmm3,%zmm3
2242         vmovdqu32       %zmm19,0x100(%rdi)
2243         vmovdqu32       %zmm1,0x140(%rdi)
2244         vmovdqu32       %zmm18,0x180(%rdi)
2245         vmovdqu32       %zmm3,0x1c0(%rdi)
2246
2247         vpxord          0x200(%rsi),%zmm0,%zmm0
2248         vpxord          0x240(%rsi),%zmm9,%zmm9
2249         vpxord          0x280(%rsi),%zmm6,%zmm6
2250         vpxord          0x2c0(%rsi),%zmm11,%zmm11
2251         vmovdqu32       %zmm0,0x200(%rdi)
2252         vmovdqu32       %zmm9,0x240(%rdi)
2253         vmovdqu32       %zmm6,0x280(%rdi)
2254         vmovdqu32       %zmm11,0x2c0(%rdi)
2255
2256         vpxord          0x300(%rsi),%zmm13,%zmm13
2257         vpxord          0x340(%rsi),%zmm10,%zmm10
2258         vpxord          0x380(%rsi),%zmm15,%zmm15
2259         vpxord          0x3c0(%rsi),%zmm12,%zmm12
2260         lea             0x400(%rsi),%rsi
2261         vmovdqu32       %zmm13,0x300(%rdi)
2262         vmovdqu32       %zmm10,0x340(%rdi)
2263         vmovdqu32       %zmm15,0x380(%rdi)
2264         vmovdqu32       %zmm12,0x3c0(%rdi)
2265         lea             0x400(%rdi),%rdi
2266
2267         sub             $64*16,%rdx
2268         jnz             .Loop_outer16x
2269
2270         jmp             .Ldone16x
2271
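# Tail of the 16x path: %rdi becomes an offset from %rsi so one index
# addresses both buffers.  Whole 64-byte blocks are handled one register at a
# time; the first block that is not fully consumed is copied into zmm16 for
# the byte loop, and the stack staging area is cleared afterwards.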
2272 .align  32
2273 .Ltail16x:
2274         xor             %r9,%r9
2275         sub             %rsi,%rdi
2276         cmp             $64*1,%rdx
2277         jb              .Less_than_64_16x
2278         vpxord          (%rsi),%zmm16,%zmm16    # xor with input
2279         vmovdqu32       %zmm16,(%rdi,%rsi)
2280         je              .Ldone16x
2281         vmovdqa32       %zmm17,%zmm16
2282         lea             64(%rsi),%rsi
2283
2284         cmp             $64*2,%rdx
2285         jb              .Less_than_64_16x
2286         vpxord          (%rsi),%zmm17,%zmm17
2287         vmovdqu32       %zmm17,(%rdi,%rsi)
2288         je              .Ldone16x
2289         vmovdqa32       %zmm14,%zmm16
2290         lea             64(%rsi),%rsi
2291
2292         cmp             $64*3,%rdx
2293         jb              .Less_than_64_16x
2294         vpxord          (%rsi),%zmm14,%zmm14
2295         vmovdqu32       %zmm14,(%rdi,%rsi)
2296         je              .Ldone16x
2297         vmovdqa32       %zmm8,%zmm16
2298         lea             64(%rsi),%rsi
2299
2300         cmp             $64*4,%rdx
2301         jb              .Less_than_64_16x
2302         vpxord          (%rsi),%zmm8,%zmm8
2303         vmovdqu32       %zmm8,(%rdi,%rsi)
2304         je              .Ldone16x
2305         vmovdqa32       %zmm19,%zmm16
2306         lea             64(%rsi),%rsi
2307
2308         cmp             $64*5,%rdx
2309         jb              .Less_than_64_16x
2310         vpxord          (%rsi),%zmm19,%zmm19
2311         vmovdqu32       %zmm19,(%rdi,%rsi)
2312         je              .Ldone16x
2313         vmovdqa32       %zmm1,%zmm16
2314         lea             64(%rsi),%rsi
2315
2316         cmp             $64*6,%rdx
2317         jb              .Less_than_64_16x
2318         vpxord          (%rsi),%zmm1,%zmm1
2319         vmovdqu32       %zmm1,(%rdi,%rsi)
2320         je              .Ldone16x
2321         vmovdqa32       %zmm18,%zmm16
2322         lea             64(%rsi),%rsi
2323
2324         cmp             $64*7,%rdx
2325         jb              .Less_than_64_16x
2326         vpxord          (%rsi),%zmm18,%zmm18
2327         vmovdqu32       %zmm18,(%rdi,%rsi)
2328         je              .Ldone16x
2329         vmovdqa32       %zmm3,%zmm16
2330         lea             64(%rsi),%rsi
2331
2332         cmp             $64*8,%rdx
2333         jb              .Less_than_64_16x
2334         vpxord          (%rsi),%zmm3,%zmm3
2335         vmovdqu32       %zmm3,(%rdi,%rsi)
2336         je              .Ldone16x
2337         vmovdqa32       %zmm0,%zmm16
2338         lea             64(%rsi),%rsi
2339
2340         cmp             $64*9,%rdx
2341         jb              .Less_than_64_16x
2342         vpxord          (%rsi),%zmm0,%zmm0
2343         vmovdqu32       %zmm0,(%rdi,%rsi)
2344         je              .Ldone16x
2345         vmovdqa32       %zmm9,%zmm16
2346         lea             64(%rsi),%rsi
2347
2348         cmp             $64*10,%rdx
2349         jb              .Less_than_64_16x
2350         vpxord          (%rsi),%zmm9,%zmm9
2351         vmovdqu32       %zmm9,(%rdi,%rsi)
2352         je              .Ldone16x
2353         vmovdqa32       %zmm6,%zmm16
2354         lea             64(%rsi),%rsi
2355
2356         cmp             $64*11,%rdx
2357         jb              .Less_than_64_16x
2358         vpxord          (%rsi),%zmm6,%zmm6
2359         vmovdqu32       %zmm6,(%rdi,%rsi)
2360         je              .Ldone16x
2361         vmovdqa32       %zmm11,%zmm16
2362         lea             64(%rsi),%rsi
2363
2364         cmp             $64*12,%rdx
2365         jb              .Less_than_64_16x
2366         vpxord          (%rsi),%zmm11,%zmm11
2367         vmovdqu32       %zmm11,(%rdi,%rsi)
2368         je              .Ldone16x
2369         vmovdqa32       %zmm13,%zmm16
2370         lea             64(%rsi),%rsi
2371
2372         cmp             $64*13,%rdx
2373         jb              .Less_than_64_16x
2374         vpxord          (%rsi),%zmm13,%zmm13
2375         vmovdqu32       %zmm13,(%rdi,%rsi)
2376         je              .Ldone16x
2377         vmovdqa32       %zmm10,%zmm16
2378         lea             64(%rsi),%rsi
2379
2380         cmp             $64*14,%rdx
2381         jb              .Less_than_64_16x
2382         vpxord          (%rsi),%zmm10,%zmm10
2383         vmovdqu32       %zmm10,(%rdi,%rsi)
2384         je              .Ldone16x
2385         vmovdqa32       %zmm15,%zmm16
2386         lea             64(%rsi),%rsi
2387
2388         cmp             $64*15,%rdx
2389         jb              .Less_than_64_16x
2390         vpxord          (%rsi),%zmm15,%zmm15
2391         vmovdqu32       %zmm15,(%rdi,%rsi)
2392         je              .Ldone16x
2393         vmovdqa32       %zmm12,%zmm16
2394         lea             64(%rsi),%rsi
2395
2396 .Less_than_64_16x:
2397         vmovdqa32       %zmm16,0x00(%rsp)
2398         lea             (%rdi,%rsi),%rdi
2399         and             $63,%rdx
2400
2401 .Loop_tail16x:
2402         movzb           (%rsi,%r9),%eax
2403         movzb           (%rsp,%r9),%ecx
2404         lea             1(%r9),%r9
2405         xor             %ecx,%eax
2406         mov             %al,-1(%rdi,%r9)
2407         dec             %rdx
2408         jnz             .Loop_tail16x
2409
2410         vpxord          %zmm16,%zmm16,%zmm16
2411         vmovdqa32       %zmm16,0(%rsp)
2412
2413 .Ldone16x:
2414         vzeroall
2415         lea             -8(%r10),%rsp
2416 .L16x_epilogue:
2417         ret
2418 .size   chacha20_16x,.-chacha20_16x
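# chacha20_8xvl: AVX512VL bulk path, same lane-smashing scheme as chacha20_16x
# but in ymm registers, i.e. eight independent blocks (512 bytes) per outer
# pass; the final regrouping uses vshufi32x4 and vperm2i128 on 128-bit halves.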
2419 .type   chacha20_8xvl,@function
2420 .align  32
2421 chacha20_8xvl:
2422 .Lchacha20_8xvl:
2423         lea             8(%rsp),%r10            # frame register
2424         sub             $64+8,%rsp
2425         and             $-64,%rsp
2426         vzeroupper
2427
2428         lea             .Lsigma(%rip),%r9
2429         vbroadcasti128  (%r9),%ymm3             # key[0]
2430         vbroadcasti128  (%rcx),%ymm7            # key[1]
2431         vbroadcasti128  16(%rcx),%ymm11         # key[2]
2432         vbroadcasti128  (%r8),%ymm15            # key[3]
2433
2434         vpshufd         $0x00,%ymm3,%ymm0       # smash key by lanes...
2435         vpshufd         $0x55,%ymm3,%ymm1
2436         vpshufd         $0xaa,%ymm3,%ymm2
2437         vpshufd         $0xff,%ymm3,%ymm3
2438         vmovdqa64       %ymm0,%ymm16
2439         vmovdqa64       %ymm1,%ymm17
2440         vmovdqa64       %ymm2,%ymm18
2441         vmovdqa64       %ymm3,%ymm19
2442
2443         vpshufd         $0x00,%ymm7,%ymm4
2444         vpshufd         $0x55,%ymm7,%ymm5
2445         vpshufd         $0xaa,%ymm7,%ymm6
2446         vpshufd         $0xff,%ymm7,%ymm7
2447         vmovdqa64       %ymm4,%ymm20
2448         vmovdqa64       %ymm5,%ymm21
2449         vmovdqa64       %ymm6,%ymm22
2450         vmovdqa64       %ymm7,%ymm23
2451
2452         vpshufd         $0x00,%ymm11,%ymm8
2453         vpshufd         $0x55,%ymm11,%ymm9
2454         vpshufd         $0xaa,%ymm11,%ymm10
2455         vpshufd         $0xff,%ymm11,%ymm11
2456         vmovdqa64       %ymm8,%ymm24
2457         vmovdqa64       %ymm9,%ymm25
2458         vmovdqa64       %ymm10,%ymm26
2459         vmovdqa64       %ymm11,%ymm27
2460
2461         vpshufd         $0x00,%ymm15,%ymm12
2462         vpshufd         $0x55,%ymm15,%ymm13
2463         vpshufd         $0xaa,%ymm15,%ymm14
2464         vpshufd         $0xff,%ymm15,%ymm15
2465         vpaddd          .Lincy(%rip),%ymm12,%ymm12      # don't save counters yet
2466         vmovdqa64       %ymm12,%ymm28
2467         vmovdqa64       %ymm13,%ymm29
2468         vmovdqa64       %ymm14,%ymm30
2469         vmovdqa64       %ymm15,%ymm31
2470
2471         mov             $10,%eax
2472         jmp             .Loop8xvl
2473
2474 .align  32
2475 .Loop_outer8xvl:
2476         #vpbroadcastd   0(%r9),%ymm0            # reload key
2477         #vpbroadcastd   4(%r9),%ymm1
2478         vpbroadcastd    8(%r9),%ymm2
2479         vpbroadcastd    12(%r9),%ymm3
2480         vpaddd          .Leight(%rip),%ymm28,%ymm28     # next SIMD counters
2481         vmovdqa64       %ymm20,%ymm4
2482         vmovdqa64       %ymm21,%ymm5
2483         vmovdqa64       %ymm22,%ymm6
2484         vmovdqa64       %ymm23,%ymm7
2485         vmovdqa64       %ymm24,%ymm8
2486         vmovdqa64       %ymm25,%ymm9
2487         vmovdqa64       %ymm26,%ymm10
2488         vmovdqa64       %ymm27,%ymm11
2489         vmovdqa64       %ymm28,%ymm12
2490         vmovdqa64       %ymm29,%ymm13
2491         vmovdqa64       %ymm30,%ymm14
2492         vmovdqa64       %ymm31,%ymm15
2493
2494         vmovdqa64       %ymm0,%ymm16
2495         vmovdqa64       %ymm1,%ymm17
2496         vmovdqa64       %ymm2,%ymm18
2497         vmovdqa64       %ymm3,%ymm19
2498
2499         mov             $10,%eax
2500         jmp             .Loop8xvl
2501
2502 .align  32
2503 .Loop8xvl:
2504         vpaddd  %ymm4,%ymm0,%ymm0
2505         vpaddd  %ymm5,%ymm1,%ymm1
2506         vpaddd  %ymm6,%ymm2,%ymm2
2507         vpaddd  %ymm7,%ymm3,%ymm3
2508         vpxor   %ymm0,%ymm12,%ymm12
2509         vpxor   %ymm1,%ymm13,%ymm13
2510         vpxor   %ymm2,%ymm14,%ymm14
2511         vpxor   %ymm3,%ymm15,%ymm15
2512         vprold  $16,%ymm12,%ymm12
2513         vprold  $16,%ymm13,%ymm13
2514         vprold  $16,%ymm14,%ymm14
2515         vprold  $16,%ymm15,%ymm15
2516         vpaddd  %ymm12,%ymm8,%ymm8
2517         vpaddd  %ymm13,%ymm9,%ymm9
2518         vpaddd  %ymm14,%ymm10,%ymm10
2519         vpaddd  %ymm15,%ymm11,%ymm11
2520         vpxor   %ymm8,%ymm4,%ymm4
2521         vpxor   %ymm9,%ymm5,%ymm5
2522         vpxor   %ymm10,%ymm6,%ymm6
2523         vpxor   %ymm11,%ymm7,%ymm7
2524         vprold  $12,%ymm4,%ymm4
2525         vprold  $12,%ymm5,%ymm5
2526         vprold  $12,%ymm6,%ymm6
2527         vprold  $12,%ymm7,%ymm7
2528         vpaddd  %ymm4,%ymm0,%ymm0
2529         vpaddd  %ymm5,%ymm1,%ymm1
2530         vpaddd  %ymm6,%ymm2,%ymm2
2531         vpaddd  %ymm7,%ymm3,%ymm3
2532         vpxor   %ymm0,%ymm12,%ymm12
2533         vpxor   %ymm1,%ymm13,%ymm13
2534         vpxor   %ymm2,%ymm14,%ymm14
2535         vpxor   %ymm3,%ymm15,%ymm15
2536         vprold  $8,%ymm12,%ymm12
2537         vprold  $8,%ymm13,%ymm13
2538         vprold  $8,%ymm14,%ymm14
2539         vprold  $8,%ymm15,%ymm15
2540         vpaddd  %ymm12,%ymm8,%ymm8
2541         vpaddd  %ymm13,%ymm9,%ymm9
2542         vpaddd  %ymm14,%ymm10,%ymm10
2543         vpaddd  %ymm15,%ymm11,%ymm11
2544         vpxor   %ymm8,%ymm4,%ymm4
2545         vpxor   %ymm9,%ymm5,%ymm5
2546         vpxor   %ymm10,%ymm6,%ymm6
2547         vpxor   %ymm11,%ymm7,%ymm7
2548         vprold  $7,%ymm4,%ymm4
2549         vprold  $7,%ymm5,%ymm5
2550         vprold  $7,%ymm6,%ymm6
2551         vprold  $7,%ymm7,%ymm7
2552         vpaddd  %ymm5,%ymm0,%ymm0
2553         vpaddd  %ymm6,%ymm1,%ymm1
2554         vpaddd  %ymm7,%ymm2,%ymm2
2555         vpaddd  %ymm4,%ymm3,%ymm3
2556         vpxor   %ymm0,%ymm15,%ymm15
2557         vpxor   %ymm1,%ymm12,%ymm12
2558         vpxor   %ymm2,%ymm13,%ymm13
2559         vpxor   %ymm3,%ymm14,%ymm14
2560         vprold  $16,%ymm15,%ymm15
2561         vprold  $16,%ymm12,%ymm12
2562         vprold  $16,%ymm13,%ymm13
2563         vprold  $16,%ymm14,%ymm14
2564         vpaddd  %ymm15,%ymm10,%ymm10
2565         vpaddd  %ymm12,%ymm11,%ymm11
2566         vpaddd  %ymm13,%ymm8,%ymm8
2567         vpaddd  %ymm14,%ymm9,%ymm9
2568         vpxor   %ymm10,%ymm5,%ymm5
2569         vpxor   %ymm11,%ymm6,%ymm6
2570         vpxor   %ymm8,%ymm7,%ymm7
2571         vpxor   %ymm9,%ymm4,%ymm4
2572         vprold  $12,%ymm5,%ymm5
2573         vprold  $12,%ymm6,%ymm6
2574         vprold  $12,%ymm7,%ymm7
2575         vprold  $12,%ymm4,%ymm4
2576         vpaddd  %ymm5,%ymm0,%ymm0
2577         vpaddd  %ymm6,%ymm1,%ymm1
2578         vpaddd  %ymm7,%ymm2,%ymm2
2579         vpaddd  %ymm4,%ymm3,%ymm3
2580         vpxor   %ymm0,%ymm15,%ymm15
2581         vpxor   %ymm1,%ymm12,%ymm12
2582         vpxor   %ymm2,%ymm13,%ymm13
2583         vpxor   %ymm3,%ymm14,%ymm14
2584         vprold  $8,%ymm15,%ymm15
2585         vprold  $8,%ymm12,%ymm12
2586         vprold  $8,%ymm13,%ymm13
2587         vprold  $8,%ymm14,%ymm14
2588         vpaddd  %ymm15,%ymm10,%ymm10
2589         vpaddd  %ymm12,%ymm11,%ymm11
2590         vpaddd  %ymm13,%ymm8,%ymm8
2591         vpaddd  %ymm14,%ymm9,%ymm9
2592         vpxor   %ymm10,%ymm5,%ymm5
2593         vpxor   %ymm11,%ymm6,%ymm6
2594         vpxor   %ymm8,%ymm7,%ymm7
2595         vpxor   %ymm9,%ymm4,%ymm4
2596         vprold  $7,%ymm5,%ymm5
2597         vprold  $7,%ymm6,%ymm6
2598         vprold  $7,%ymm7,%ymm7
2599         vprold  $7,%ymm4,%ymm4
2600         dec             %eax
2601         jnz             .Loop8xvl
2602
2603         vpaddd          %ymm16,%ymm0,%ymm0      # accumulate key
2604         vpaddd          %ymm17,%ymm1,%ymm1
2605         vpaddd          %ymm18,%ymm2,%ymm2
2606         vpaddd          %ymm19,%ymm3,%ymm3
2607
2608         vpunpckldq      %ymm1,%ymm0,%ymm18              # "de-interlace" data
2609         vpunpckldq      %ymm3,%ymm2,%ymm19
2610         vpunpckhdq      %ymm1,%ymm0,%ymm0
2611         vpunpckhdq      %ymm3,%ymm2,%ymm2
2612         vpunpcklqdq     %ymm19,%ymm18,%ymm1             # "a0"
2613         vpunpckhqdq     %ymm19,%ymm18,%ymm18            # "a1"
2614         vpunpcklqdq     %ymm2,%ymm0,%ymm3               # "a2"
2615         vpunpckhqdq     %ymm2,%ymm0,%ymm0               # "a3"
2616         vpaddd          %ymm20,%ymm4,%ymm4
2617         vpaddd          %ymm21,%ymm5,%ymm5
2618         vpaddd          %ymm22,%ymm6,%ymm6
2619         vpaddd          %ymm23,%ymm7,%ymm7
2620
2621         vpunpckldq      %ymm5,%ymm4,%ymm2
2622         vpunpckldq      %ymm7,%ymm6,%ymm19
2623         vpunpckhdq      %ymm5,%ymm4,%ymm4
2624         vpunpckhdq      %ymm7,%ymm6,%ymm6
2625         vpunpcklqdq     %ymm19,%ymm2,%ymm5              # "b0"
2626         vpunpckhqdq     %ymm19,%ymm2,%ymm2              # "b1"
2627         vpunpcklqdq     %ymm6,%ymm4,%ymm7               # "b2"
2628         vpunpckhqdq     %ymm6,%ymm4,%ymm4               # "b3"
2629         vshufi32x4      $0,%ymm5,%ymm1,%ymm19   # "de-interlace" further
2630         vshufi32x4      $3,%ymm5,%ymm1,%ymm5
2631         vshufi32x4      $0,%ymm2,%ymm18,%ymm1
2632         vshufi32x4      $3,%ymm2,%ymm18,%ymm2
2633         vshufi32x4      $0,%ymm7,%ymm3,%ymm18
2634         vshufi32x4      $3,%ymm7,%ymm3,%ymm7
2635         vshufi32x4      $0,%ymm4,%ymm0,%ymm3
2636         vshufi32x4      $3,%ymm4,%ymm0,%ymm4
2637         vpaddd          %ymm24,%ymm8,%ymm8
2638         vpaddd          %ymm25,%ymm9,%ymm9
2639         vpaddd          %ymm26,%ymm10,%ymm10
2640         vpaddd          %ymm27,%ymm11,%ymm11
2641
2642         vpunpckldq      %ymm9,%ymm8,%ymm6
2643         vpunpckldq      %ymm11,%ymm10,%ymm0
2644         vpunpckhdq      %ymm9,%ymm8,%ymm8
2645         vpunpckhdq      %ymm11,%ymm10,%ymm10
2646         vpunpcklqdq     %ymm0,%ymm6,%ymm9               # "c0"
2647         vpunpckhqdq     %ymm0,%ymm6,%ymm6               # "c1"
2648         vpunpcklqdq     %ymm10,%ymm8,%ymm11             # "c2"
2649         vpunpckhqdq     %ymm10,%ymm8,%ymm8              # "c3"
2650         vpaddd          %ymm28,%ymm12,%ymm12
2651         vpaddd          %ymm29,%ymm13,%ymm13
2652         vpaddd          %ymm30,%ymm14,%ymm14
2653         vpaddd          %ymm31,%ymm15,%ymm15
2654
2655         vpunpckldq      %ymm13,%ymm12,%ymm10
2656         vpunpckldq      %ymm15,%ymm14,%ymm0
2657         vpunpckhdq      %ymm13,%ymm12,%ymm12
2658         vpunpckhdq      %ymm15,%ymm14,%ymm14
2659         vpunpcklqdq     %ymm0,%ymm10,%ymm13             # "d0"
2660         vpunpckhqdq     %ymm0,%ymm10,%ymm10             # "d1"
2661         vpunpcklqdq     %ymm14,%ymm12,%ymm15            # "d2"
2662         vpunpckhqdq     %ymm14,%ymm12,%ymm12            # "d3"
2663         vperm2i128      $0x20,%ymm13,%ymm9,%ymm0        # "de-interlace" further
2664         vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
2665         vperm2i128      $0x20,%ymm10,%ymm6,%ymm9
2666         vperm2i128      $0x31,%ymm10,%ymm6,%ymm10
2667         vperm2i128      $0x20,%ymm15,%ymm11,%ymm6
2668         vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
2669         vperm2i128      $0x20,%ymm12,%ymm8,%ymm11
2670         vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
2671         cmp             $64*8,%rdx
2672         jb              .Ltail8xvl
2673
2674         mov             $0x80,%eax              # size optimization
2675         vpxord          0x00(%rsi),%ymm19,%ymm19        # xor with input
2676         vpxor           0x20(%rsi),%ymm0,%ymm0
2677         vpxor           0x40(%rsi),%ymm5,%ymm5
2678         vpxor           0x60(%rsi),%ymm13,%ymm13
2679         lea             (%rsi,%rax),%rsi        # size optimization
2680         vmovdqu32       %ymm19,0x00(%rdi)
2681         vmovdqu         %ymm0,0x20(%rdi)
2682         vmovdqu         %ymm5,0x40(%rdi)
2683         vmovdqu         %ymm13,0x60(%rdi)
2684         lea             (%rdi,%rax),%rdi        # size optimization
2685
2686         vpxor           0x00(%rsi),%ymm1,%ymm1
2687         vpxor           0x20(%rsi),%ymm9,%ymm9
2688         vpxor           0x40(%rsi),%ymm2,%ymm2
2689         vpxor           0x60(%rsi),%ymm10,%ymm10
2690         lea             (%rsi,%rax),%rsi        # size optimization
2691         vmovdqu         %ymm1,0x00(%rdi)
2692         vmovdqu         %ymm9,0x20(%rdi)
2693         vmovdqu         %ymm2,0x40(%rdi)
2694         vmovdqu         %ymm10,0x60(%rdi)
2695         lea             (%rdi,%rax),%rdi        # size optimization
2696
2697         vpxord          0x00(%rsi),%ymm18,%ymm18
2698         vpxor           0x20(%rsi),%ymm6,%ymm6
2699         vpxor           0x40(%rsi),%ymm7,%ymm7
2700         vpxor           0x60(%rsi),%ymm15,%ymm15
2701         lea             (%rsi,%rax),%rsi        # size optimization
2702         vmovdqu32       %ymm18,0x00(%rdi)
2703         vmovdqu         %ymm6,0x20(%rdi)
2704         vmovdqu         %ymm7,0x40(%rdi)
2705         vmovdqu         %ymm15,0x60(%rdi)
2706         lea             (%rdi,%rax),%rdi        # size optimization
2707
2708         vpxor           0x00(%rsi),%ymm3,%ymm3
2709         vpxor           0x20(%rsi),%ymm11,%ymm11
2710         vpxor           0x40(%rsi),%ymm4,%ymm4
2711         vpxor           0x60(%rsi),%ymm12,%ymm12
2712         lea             (%rsi,%rax),%rsi        # size optimization
2713         vmovdqu         %ymm3,0x00(%rdi)
2714         vmovdqu         %ymm11,0x20(%rdi)
2715         vmovdqu         %ymm4,0x40(%rdi)
2716         vmovdqu         %ymm12,0x60(%rdi)
2717         lea             (%rdi,%rax),%rdi        # size optimization
2718
2719         vpbroadcastd    0(%r9),%ymm0            # reload key
2720         vpbroadcastd    4(%r9),%ymm1
2721
2722         sub             $64*8,%rdx
2723         jnz             .Loop_outer8xvl
2724
2725         jmp             .Ldone8xvl
2726
2727 .align  32
2728 .Ltail8xvl:
2729         vmovdqa64       %ymm19,%ymm8            # size optimization
2730         xor             %r9,%r9
2731         sub             %rsi,%rdi
2732         cmp             $64*1,%rdx
2733         jb              .Less_than_64_8xvl
2734         vpxor           0x00(%rsi),%ymm8,%ymm8  # xor with input
2735         vpxor           0x20(%rsi),%ymm0,%ymm0
2736         vmovdqu         %ymm8,0x00(%rdi,%rsi)
2737         vmovdqu         %ymm0,0x20(%rdi,%rsi)
2738         je              .Ldone8xvl
2739         vmovdqa         %ymm5,%ymm8
2740         vmovdqa         %ymm13,%ymm0
2741         lea             64(%rsi),%rsi
2742
2743         cmp             $64*2,%rdx
2744         jb              .Less_than_64_8xvl
2745         vpxor           0x00(%rsi),%ymm5,%ymm5
2746         vpxor           0x20(%rsi),%ymm13,%ymm13
2747         vmovdqu         %ymm5,0x00(%rdi,%rsi)
2748         vmovdqu         %ymm13,0x20(%rdi,%rsi)
2749         je              .Ldone8xvl
2750         vmovdqa         %ymm1,%ymm8
2751         vmovdqa         %ymm9,%ymm0
2752         lea             64(%rsi),%rsi
2753
2754         cmp             $64*3,%rdx
2755         jb              .Less_than_64_8xvl
2756         vpxor           0x00(%rsi),%ymm1,%ymm1
2757         vpxor           0x20(%rsi),%ymm9,%ymm9
2758         vmovdqu         %ymm1,0x00(%rdi,%rsi)
2759         vmovdqu         %ymm9,0x20(%rdi,%rsi)
2760         je              .Ldone8xvl
2761         vmovdqa         %ymm2,%ymm8
2762         vmovdqa         %ymm10,%ymm0
2763         lea             64(%rsi),%rsi
2764
2765         cmp             $64*4,%rdx
2766         jb              .Less_than_64_8xvl
2767         vpxor           0x00(%rsi),%ymm2,%ymm2
2768         vpxor           0x20(%rsi),%ymm10,%ymm10
2769         vmovdqu         %ymm2,0x00(%rdi,%rsi)
2770         vmovdqu         %ymm10,0x20(%rdi,%rsi)
2771         je              .Ldone8xvl
2772         vmovdqa32       %ymm18,%ymm8
2773         vmovdqa         %ymm6,%ymm0
2774         lea             64(%rsi),%rsi
2775
2776         cmp             $64*5,%rdx
2777         jb              .Less_than_64_8xvl
2778         vpxord          0x00(%rsi),%ymm18,%ymm18
2779         vpxor           0x20(%rsi),%ymm6,%ymm6
2780         vmovdqu32       %ymm18,0x00(%rdi,%rsi)
2781         vmovdqu         %ymm6,0x20(%rdi,%rsi)
2782         je              .Ldone8xvl
2783         vmovdqa         %ymm7,%ymm8
2784         vmovdqa         %ymm15,%ymm0
2785         lea             64(%rsi),%rsi
2786
2787         cmp             $64*6,%rdx
2788         jb              .Less_than_64_8xvl
2789         vpxor           0x00(%rsi),%ymm7,%ymm7
2790         vpxor           0x20(%rsi),%ymm15,%ymm15
2791         vmovdqu         %ymm7,0x00(%rdi,%rsi)
2792         vmovdqu         %ymm15,0x20(%rdi,%rsi)
2793         je              .Ldone8xvl
2794         vmovdqa         %ymm3,%ymm8
2795         vmovdqa         %ymm11,%ymm0
2796         lea             64(%rsi),%rsi
2797
2798         cmp             $64*7,%rdx
2799         jb              .Less_than_64_8xvl
2800         vpxor           0x00(%rsi),%ymm3,%ymm3
2801         vpxor           0x20(%rsi),%ymm11,%ymm11
2802         vmovdqu         %ymm3,0x00(%rdi,%rsi)
2803         vmovdqu         %ymm11,0x20(%rdi,%rsi)
2804         je              .Ldone8xvl
2805         vmovdqa         %ymm4,%ymm8
2806         vmovdqa         %ymm12,%ymm0
2807         lea             64(%rsi),%rsi
2808
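# Fewer than 64 bytes remain: the pending keystream block was staged in
# ymm8/ymm0 above; spill it to the stack, XOR the tail byte by byte, then
# clear the stack copy.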
2809 .Less_than_64_8xvl:
2810         vmovdqa         %ymm8,0x00(%rsp)
2811         vmovdqa         %ymm0,0x20(%rsp)
2812         lea             (%rdi,%rsi),%rdi
2813         and             $63,%rdx
2814
2815 .Loop_tail8xvl:
2816         movzb           (%rsi,%r9),%eax
2817         movzb           (%rsp,%r9),%ecx
2818         lea             1(%r9),%r9
2819         xor             %ecx,%eax
2820         mov             %al,-1(%rdi,%r9)
2821         dec             %rdx
2822         jnz             .Loop_tail8xvl
2823
2824         vpxor           %ymm8,%ymm8,%ymm8
2825         vmovdqa         %ymm8,0x00(%rsp)
2826         vmovdqa         %ymm8,0x20(%rsp)
2827
2828 .Ldone8xvl:
2829         vzeroall
2830         lea             -8(%r10),%rsp
2831 .L8xvl_epilogue:
2832         ret
2833 .size   chacha20_8xvl,.-chacha20_8xvl
2834 #endif