1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
3 // Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
4 // Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5 // Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
7 // This code is taken from the OpenSSL project but the author, Andy Polyakov,
8 // has relicensed it under the licenses specified in the SPDX header above.
9 // The original headers, including the original license headers, are
10 // included below for completeness.
12 // ====================================================================
13 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 // project. The module is, however, dual licensed under OpenSSL and
15 // CRYPTOGAMS licenses depending on where you obtain it. For further
16 // details see http://www.openssl.org/~appro/cryptogams/.
17 // ====================================================================
21 // ChaCha20 for x86_64.
25 // Add AVX512F code path.
29 // Add AVX512VL code path.
31 // Performance in cycles per byte out of a large buffer.
33 //                IALU/gcc 4.8(i)  1x/2xSSSE3(ii)  4xSSSE3    NxAVX(v)
36 // Core2          7.83/+55%        7.90/5.76       4.35
37 // Westmere       7.19/+50%        5.60/4.50       3.00
38 // Sandy Bridge   8.31/+42%        5.45/4.00       2.72
39 // Ivy Bridge     6.71/+46%        5.40/?          2.41
40 // Haswell        5.92/+43%        5.20/3.45       2.42       1.23
41 // Skylake[-X]    5.87/+39%        4.70/3.22       2.31       1.19[0.80(vi)]
42 // Silvermont     12.0/+33%        7.75/6.90       7.03(iii)
43 // Knights L      11.7/-           ?               9.60(iii)  0.80
44 // Goldmont       10.6/+17%        5.10/3.52       3.28
45 // Sledgehammer   7.28/+52%        -               -
46 // Bulldozer      9.66/+28%        9.85/5.35(iv)   3.06(iv)
47 // Ryzen          5.96/+50%        5.19/3.00       2.40       2.09
48 // VIA Nano       10.5/+46%        6.72/6.88       6.05
50 // (i) compared to older gcc 3.x one can observe >2x improvement on most platforms;
52 // (ii) 2xSSSE3 is a code path optimized specifically for the 128-byte inputs used
53 // by chacha20_poly1305_tls_cipher; results are EVP-free;
54 // (iii) this is not an optimal result for Atom because of MSROM
55 // limitations; SSE2 can do better, but the gain is considered too
56 // low to justify the [maintenance] effort;
57 // (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
58 // and 4.85 for 128-byte inputs;
59 // (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever is best applicable;
60 // (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
61 // cpb in single thread, the corresponding capability is suppressed;
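//
// For reference, every SIMD path below is a vectorization of the
// standard ChaCha20 quarter round. A rough C sketch (assuming the
// RFC 7539 definitions, not code generated from this file):
//
//	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))
//	#define QR(a, b, c, d) do {			\
//		a += b; d ^= a; d = ROTL32(d, 16);	\
//		c += d; b ^= c; b = ROTL32(b, 12);	\
//		a += b; d ^= a; d = ROTL32(d, 8);	\
//		c += d; b ^= c; b = ROTL32(b, 7);	\
//	} while (0)
//
// Ten iterations of four column QRs plus four diagonal QRs make up
// the 20 rounds; the SSE/AVX code interleaves several such states.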
63 #include <linux/linkage.h>
64 .section .rodata.cst16.Lzero, "aM", @progbits, 16
68 .section .rodata.cst16.Lone, "aM", @progbits, 16
72 .section .rodata.cst16.Linc, "aM", @progbits, 16
76 .section .rodata.cst16.Lfour, "aM", @progbits, 16
80 .section .rodata.cst32.Lincy, "aM", @progbits, 32
84 .section .rodata.cst32.Leight, "aM", @progbits, 32
88 .section .rodata.cst16.Lrot16, "aM", @progbits, 16
91 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
92 .section .rodata.cst16.Lrot24, "aM", @progbits, 16
95 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
96 .section .rodata.cst32.Ltwoy, "aM", @progbits, 32
99 .long 2,0,0,0, 2,0,0,0
100 .section .rodata.cst64.Lzeroz, "aM", @progbits, 64
103 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
104 .section .rodata.cst64.Lfourz, "aM", @progbits, 64
107 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
108 .section .rodata.cst64.Lincz, "aM", @progbits, 64
111 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
112 .section .rodata.cst64.Lsixteen, "aM", @progbits, 64
115 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
116 .section .rodata.cst16.Lsigma, "aM", @progbits, 16
119 .ascii "expand 32-byte k"
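# .Lsigma is the standard ChaCha constant "expand 32-byte k". The
# .Lrot16/.Lrot24 tables above are pshufb masks: rotating each 32-bit
# lane by 16 or 24 bits is a pure byte permutation, so SSSE3 can do it
# in a single pshufb instead of a shift/shift/or triple. .Lone, .Linc
# and .Lfour (and their y/z variants) supply the per-block counter
# increments for the 1x/4x/8x/16x code paths.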
121 #ifdef CONFIG_AS_SSSE3
123 SYM_FUNC_START(hchacha20_ssse3)
125 movdqa .Lsigma(%rip),%xmm0
127 movdqu 16(%rdx),%xmm2
129 # This code is only used when targeting the kernel.
130 # If targeting win64, code to preserve xmm{6,7} would need to be added.
131 movdqa .Lrot16(%rip),%xmm6
132 movdqa .Lrot24(%rip),%xmm7
133 mov $10,%r8 # reuse %r8
155 pshufd $147,%xmm0,%xmm0
156 pshufd $78,%xmm3,%xmm3
157 pshufd $57,%xmm2,%xmm2
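# Between the column and diagonal halves of a double round, the state
# rows are rotated in place: pshufd $57 (0x39) rotates the four dwords
# one lane toward element 0, $78 (0x4e) swaps the two 64-bit halves,
# and $147 (0x93) rotates one lane toward element 3, so every quarter
# round can use plain packed operations.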
177 pshufd $57,%xmm0,%xmm0
178 pshufd $78,%xmm3,%xmm3
179 pshufd $147,%xmm2,%xmm2
183 movdqu %xmm3, 16(%rdi)
185 SYM_FUNC_END(hchacha20_ssse3)
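# HChaCha20 runs the same 20 rounds over sigma/key/nonce but performs
# no final feed-forward addition and emits only state words 0-3 and
# 12-15 as the derived subkey, which is why just %xmm0 and %xmm3 are
# stored above (assuming the usual SysV argument order: output in
# %rdi, nonce in %rsi, key in %rdx).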
187 SYM_FUNC_START(chacha20_ssse3)
189 lea 8(%rsp),%r10 # frame pointer
190 cmp $128,%rdx # we might throw away some data,
192 ja .Lchacha20_4x # but overall it won't be slower
194 .Ldo_ssse3_after_all:
197 movdqa .Lsigma(%rip),%xmm0
199 movdqu 16(%rcx),%xmm2
201 movdqa .Lrot16(%rip),%xmm6
202 movdqa .Lrot24(%rip),%xmm7
204 movdqa %xmm0,0x00(%rsp)
205 movdqa %xmm1,0x10(%rsp)
206 movdqa %xmm2,0x20(%rsp)
207 movdqa %xmm3,0x30(%rsp)
208 mov $10,%r8 # reuse %r8
213 movdqa .Lone(%rip),%xmm3
214 movdqa 0x00(%rsp),%xmm0
215 movdqa 0x10(%rsp),%xmm1
216 movdqa 0x20(%rsp),%xmm2
217 paddd 0x30(%rsp),%xmm3
219 movdqa %xmm3,0x30(%rsp)
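# Each outer iteration above restores the saved state from the stack
# and bumps the 32-bit block counter (state word 12) by one via the
# .Lone constant before running the rounds again.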
242 pshufd $147,%xmm0,%xmm0
243 pshufd $78,%xmm3,%xmm3
244 pshufd $57,%xmm2,%xmm2
264 pshufd $57,%xmm0,%xmm0
265 pshufd $78,%xmm3,%xmm3
266 pshufd $147,%xmm2,%xmm2
269 paddd 0x00(%rsp),%xmm0
270 paddd 0x10(%rsp),%xmm1
271 paddd 0x20(%rsp),%xmm2
272 paddd 0x30(%rsp),%xmm3
277 movdqu 0x00(%rsi),%xmm4
278 movdqu 0x10(%rsi),%xmm5
279 pxor %xmm4,%xmm0 # xor with input
280 movdqu 0x20(%rsi),%xmm4
282 movdqu 0x30(%rsi),%xmm5
283 lea 0x40(%rsi),%rsi # inp+=64
287 movdqu %xmm0,0x00(%rdi) # write output
288 movdqu %xmm1,0x10(%rdi)
289 movdqu %xmm2,0x20(%rdi)
290 movdqu %xmm3,0x30(%rdi)
291 lea 0x40(%rdi),%rdi # out+=64
294 jnz .Loop_outer_ssse3
300 movdqa %xmm0,0x00(%rsp)
301 movdqa %xmm1,0x10(%rsp)
302 movdqa %xmm2,0x20(%rsp)
303 movdqa %xmm3,0x30(%rsp)
307 movzb (%rsi,%r8),%eax
308 movzb (%rsp,%r8),%ecx
319 SYM_FUNC_END(chacha20_ssse3)
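# Dispatch summary for the SSSE3 entry point: buffers larger than 128
# bytes branch to chacha20_4x; for a trailing partial block the
# keystream is spilled to the stack and XORed into the output one byte
# at a time by the movzb loop above.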
320 .type chacha20_128,@function
324 lea 8(%rsp),%r10 # frame pointer
327 movdqa .Lsigma(%rip),%xmm8
329 movdqu 16(%rcx),%xmm2
331 movdqa .Lone(%rip),%xmm1
332 movdqa .Lrot16(%rip),%xmm6
333 movdqa .Lrot24(%rip),%xmm7
336 movdqa %xmm8,0x00(%rsp)
338 movdqa %xmm9,0x10(%rsp)
340 movdqa %xmm2,0x20(%rsp)
342 movdqa %xmm3,0x30(%rsp)
343 mov $10,%r8 # reuse %r8
384 pshufd $147,%xmm8,%xmm8
385 pshufd $78,%xmm3,%xmm3
386 pshufd $57,%xmm2,%xmm2
387 pshufd $147,%xmm10,%xmm10
388 pshufd $78,%xmm1,%xmm1
389 pshufd $57,%xmm0,%xmm0
426 pshufd $57,%xmm8,%xmm8
427 pshufd $78,%xmm3,%xmm3
428 pshufd $147,%xmm2,%xmm2
429 pshufd $57,%xmm10,%xmm10
430 pshufd $78,%xmm1,%xmm1
431 pshufd $147,%xmm0,%xmm0
434 paddd 0x00(%rsp),%xmm8
435 paddd 0x10(%rsp),%xmm9
436 paddd 0x20(%rsp),%xmm2
437 paddd 0x30(%rsp),%xmm3
438 paddd .Lone(%rip),%xmm1
439 paddd 0x00(%rsp),%xmm10
440 paddd 0x10(%rsp),%xmm11
441 paddd 0x20(%rsp),%xmm0
442 paddd 0x30(%rsp),%xmm1
444 movdqu 0x00(%rsi),%xmm4
445 movdqu 0x10(%rsi),%xmm5
446 pxor %xmm4,%xmm8 # xor with input
447 movdqu 0x20(%rsi),%xmm4
449 movdqu 0x30(%rsi),%xmm5
451 movdqu 0x40(%rsi),%xmm4
453 movdqu 0x50(%rsi),%xmm5
455 movdqu 0x60(%rsi),%xmm4
457 movdqu 0x70(%rsi),%xmm5
461 movdqu %xmm8,0x00(%rdi) # write output
462 movdqu %xmm9,0x10(%rdi)
463 movdqu %xmm2,0x20(%rdi)
464 movdqu %xmm3,0x30(%rdi)
465 movdqu %xmm10,0x40(%rdi)
466 movdqu %xmm11,0x50(%rdi)
467 movdqu %xmm0,0x60(%rdi)
468 movdqu %xmm1,0x70(%rdi)
472 .size chacha20_128,.-chacha20_128
473 .type chacha20_4x,@function
477 lea 8(%rsp),%r10 # frame pointer
483 movdqa .Lsigma(%rip),%xmm11 # key[0]
484 movdqu (%rcx),%xmm15 # key[1]
485 movdqu 16(%rcx),%xmm7 # key[2]
486 movdqu (%r8),%xmm3 # key[3]
487 lea 0x100(%rsp),%rcx # size optimization
488 lea .Lrot16(%rip),%r9
489 lea .Lrot24(%rip),%r11
491 pshufd $0x00,%xmm11,%xmm8 # smash key by lanes...
492 pshufd $0x55,%xmm11,%xmm9
493 movdqa %xmm8,0x40(%rsp) # ... and offload
494 pshufd $0xaa,%xmm11,%xmm10
495 movdqa %xmm9,0x50(%rsp)
496 pshufd $0xff,%xmm11,%xmm11
497 movdqa %xmm10,0x60(%rsp)
498 movdqa %xmm11,0x70(%rsp)
500 pshufd $0x00,%xmm15,%xmm12
501 pshufd $0x55,%xmm15,%xmm13
502 movdqa %xmm12,0x80-0x100(%rcx)
503 pshufd $0xaa,%xmm15,%xmm14
504 movdqa %xmm13,0x90-0x100(%rcx)
505 pshufd $0xff,%xmm15,%xmm15
506 movdqa %xmm14,0xa0-0x100(%rcx)
507 movdqa %xmm15,0xb0-0x100(%rcx)
509 pshufd $0x00,%xmm7,%xmm4 # ""
510 pshufd $0x55,%xmm7,%xmm5 # ""
511 movdqa %xmm4,0xc0-0x100(%rcx)
512 pshufd $0xaa,%xmm7,%xmm6 # ""
513 movdqa %xmm5,0xd0-0x100(%rcx)
514 pshufd $0xff,%xmm7,%xmm7 # ""
515 movdqa %xmm6,0xe0-0x100(%rcx)
516 movdqa %xmm7,0xf0-0x100(%rcx)
518 pshufd $0x00,%xmm3,%xmm0
519 pshufd $0x55,%xmm3,%xmm1
520 paddd .Linc(%rip),%xmm0 # don't save counters yet
521 pshufd $0xaa,%xmm3,%xmm2
522 movdqa %xmm1,0x110-0x100(%rcx)
523 pshufd $0xff,%xmm3,%xmm3
524 movdqa %xmm2,0x120-0x100(%rcx)
525 movdqa %xmm3,0x130-0x100(%rcx)
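# The "smash by lanes" sequence above converts the 4x4 state to
# structure-of-arrays form: each register/stack slot now holds one
# state word replicated across four independent blocks, with .Linc
# giving each lane its own counter. Conceptually (a C sketch, not code
# from this file):
#
#	for (w = 0; w < 16; w++)
#		for (blk = 0; blk < 4; blk++)
#			lane[w].u32[blk] = state[w] + (w == 12 ? blk : 0);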
531 movdqa 0x40(%rsp),%xmm8 # re-load smashed key
532 movdqa 0x50(%rsp),%xmm9
533 movdqa 0x60(%rsp),%xmm10
534 movdqa 0x70(%rsp),%xmm11
535 movdqa 0x80-0x100(%rcx),%xmm12
536 movdqa 0x90-0x100(%rcx),%xmm13
537 movdqa 0xa0-0x100(%rcx),%xmm14
538 movdqa 0xb0-0x100(%rcx),%xmm15
539 movdqa 0xc0-0x100(%rcx),%xmm4 # ""
540 movdqa 0xd0-0x100(%rcx),%xmm5 # ""
541 movdqa 0xe0-0x100(%rcx),%xmm6 # ""
542 movdqa 0xf0-0x100(%rcx),%xmm7 # ""
543 movdqa 0x100-0x100(%rcx),%xmm0
544 movdqa 0x110-0x100(%rcx),%xmm1
545 movdqa 0x120-0x100(%rcx),%xmm2
546 movdqa 0x130-0x100(%rcx),%xmm3
547 paddd .Lfour(%rip),%xmm0 # next SIMD counters
550 movdqa %xmm6,0x20(%rsp) # SIMD equivalent of "%nox"
551 movdqa %xmm7,0x30(%rsp) # SIMD equivalent of "%nox"
552 movdqa (%r9),%xmm7 # .Lrot16(%rip)
554 movdqa %xmm0,0x100-0x100(%rcx) # save SIMD counters
598 movdqa %xmm5,16(%rsp)
599 movdqa 32(%rsp),%xmm4
600 movdqa 48(%rsp),%xmm5
677 movdqa %xmm4,32(%rsp)
678 movdqa %xmm5,48(%rsp)
680 movdqa 16(%rsp),%xmm5
722 paddd 0x40(%rsp),%xmm8 # accumulate key material
723 paddd 0x50(%rsp),%xmm9
724 paddd 0x60(%rsp),%xmm10
725 paddd 0x70(%rsp),%xmm11
727 movdqa %xmm8,%xmm6 # "de-interlace" data
728 punpckldq %xmm9,%xmm8
730 punpckldq %xmm11,%xmm10
731 punpckhdq %xmm9,%xmm6
732 punpckhdq %xmm11,%xmm7
734 punpcklqdq %xmm10,%xmm8 # "a0"
736 punpcklqdq %xmm7,%xmm6 # "a2"
737 punpckhqdq %xmm10,%xmm9 # "a1"
738 punpckhqdq %xmm7,%xmm11 # "a3"
739 paddd 0x80-0x100(%rcx),%xmm12
740 paddd 0x90-0x100(%rcx),%xmm13
741 paddd 0xa0-0x100(%rcx),%xmm14
742 paddd 0xb0-0x100(%rcx),%xmm15
744 movdqa %xmm8,0x00(%rsp) # offload
745 movdqa %xmm9,0x10(%rsp)
746 movdqa 0x20(%rsp),%xmm8 # "xc2"
747 movdqa 0x30(%rsp),%xmm9 # "xc3"
750 punpckldq %xmm13,%xmm12
752 punpckldq %xmm15,%xmm14
753 punpckhdq %xmm13,%xmm10
754 punpckhdq %xmm15,%xmm7
756 punpcklqdq %xmm14,%xmm12 # "b0"
758 punpcklqdq %xmm7,%xmm10 # "b2"
759 punpckhqdq %xmm14,%xmm13 # "b1"
760 punpckhqdq %xmm7,%xmm15 # "b3"
761 paddd 0xc0-0x100(%rcx),%xmm4
762 paddd 0xd0-0x100(%rcx),%xmm5
763 paddd 0xe0-0x100(%rcx),%xmm8
764 paddd 0xf0-0x100(%rcx),%xmm9
766 movdqa %xmm6,0x20(%rsp) # keep offloading
767 movdqa %xmm11,0x30(%rsp)
770 punpckldq %xmm5,%xmm4
772 punpckldq %xmm9,%xmm8
773 punpckhdq %xmm5,%xmm14
774 punpckhdq %xmm9,%xmm7
776 punpcklqdq %xmm8,%xmm4 # "c0"
778 punpcklqdq %xmm7,%xmm14 # "c2"
779 punpckhqdq %xmm8,%xmm5 # "c1"
780 punpckhqdq %xmm7,%xmm9 # "c3"
781 paddd 0x100-0x100(%rcx),%xmm0
782 paddd 0x110-0x100(%rcx),%xmm1
783 paddd 0x120-0x100(%rcx),%xmm2
784 paddd 0x130-0x100(%rcx),%xmm3
787 punpckldq %xmm1,%xmm0
789 punpckldq %xmm3,%xmm2
790 punpckhdq %xmm1,%xmm8
791 punpckhdq %xmm3,%xmm7
793 punpcklqdq %xmm2,%xmm0 # "d0"
795 punpcklqdq %xmm7,%xmm8 # "d2"
796 punpckhqdq %xmm2,%xmm1 # "d1"
797 punpckhqdq %xmm7,%xmm3 # "d3"
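# The punpck{l,h}dq/punpck{l,h}qdq ladders above transpose the
# lane-major data back into block order once the key material has been
# accumulated: "a0".."d0" are block 0's four rows, "a1".."d1" block 1's,
# and so on, ready to be XORed with the input.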
801 movdqu 0x00(%rsi),%xmm6 # xor with input
802 movdqu 0x10(%rsi),%xmm11
803 movdqu 0x20(%rsi),%xmm2
804 movdqu 0x30(%rsi),%xmm7
805 pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
810 movdqu %xmm6,0x00(%rdi)
811 movdqu 0x40(%rsi),%xmm6
812 movdqu %xmm11,0x10(%rdi)
813 movdqu 0x50(%rsi),%xmm11
814 movdqu %xmm2,0x20(%rdi)
815 movdqu 0x60(%rsi),%xmm2
816 movdqu %xmm7,0x30(%rdi)
817 movdqu 0x70(%rsi),%xmm7
818 lea 0x80(%rsi),%rsi # size optimization
819 pxor 0x10(%rsp),%xmm6
824 movdqu %xmm6,0x40(%rdi)
825 movdqu 0x00(%rsi),%xmm6
826 movdqu %xmm11,0x50(%rdi)
827 movdqu 0x10(%rsi),%xmm11
828 movdqu %xmm2,0x60(%rdi)
829 movdqu 0x20(%rsi),%xmm2
830 movdqu %xmm7,0x70(%rdi)
831 lea 0x80(%rdi),%rdi # size optimization
832 movdqu 0x30(%rsi),%xmm7
833 pxor 0x20(%rsp),%xmm6
838 movdqu %xmm6,0x00(%rdi)
839 movdqu 0x40(%rsi),%xmm6
840 movdqu %xmm11,0x10(%rdi)
841 movdqu 0x50(%rsi),%xmm11
842 movdqu %xmm2,0x20(%rdi)
843 movdqu 0x60(%rsi),%xmm2
844 movdqu %xmm7,0x30(%rdi)
845 movdqu 0x70(%rsi),%xmm7
846 lea 0x80(%rsi),%rsi # inp+=64*4
847 pxor 0x30(%rsp),%xmm6
851 movdqu %xmm6,0x40(%rdi)
852 movdqu %xmm11,0x50(%rdi)
853 movdqu %xmm2,0x60(%rdi)
854 movdqu %xmm7,0x70(%rdi)
855 lea 0x80(%rdi),%rdi # out+=64*4
870 #movdqa 0x00(%rsp),%xmm6 # is offloaded, remember?
872 #movdqa %xmm6,0x00(%rsp)
873 movdqa %xmm12,0x10(%rsp)
874 movdqa %xmm4,0x20(%rsp)
875 movdqa %xmm0,0x30(%rsp)
880 movdqu 0x00(%rsi),%xmm6 # xor with input
881 movdqu 0x10(%rsi),%xmm11
882 movdqu 0x20(%rsi),%xmm2
883 movdqu 0x30(%rsi),%xmm7
884 pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
888 movdqu %xmm6,0x00(%rdi)
889 movdqu %xmm11,0x10(%rdi)
890 movdqu %xmm2,0x20(%rdi)
891 movdqu %xmm7,0x30(%rdi)
894 movdqa 0x10(%rsp),%xmm6 # is offloaded, remember?
895 lea 0x40(%rsi),%rsi # inp+=64*1
897 movdqa %xmm6,0x00(%rsp)
898 movdqa %xmm13,0x10(%rsp)
899 lea 0x40(%rdi),%rdi # out+=64*1
900 movdqa %xmm5,0x20(%rsp)
901 sub $64,%rdx # len-=64*1
902 movdqa %xmm1,0x30(%rsp)
907 movdqu 0x00(%rsi),%xmm6 # xor with input
908 movdqu 0x10(%rsi),%xmm11
909 movdqu 0x20(%rsi),%xmm2
910 movdqu 0x30(%rsi),%xmm7
911 pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
916 movdqu %xmm6,0x00(%rdi)
917 movdqu 0x40(%rsi),%xmm6
918 movdqu %xmm11,0x10(%rdi)
919 movdqu 0x50(%rsi),%xmm11
920 movdqu %xmm2,0x20(%rdi)
921 movdqu 0x60(%rsi),%xmm2
922 movdqu %xmm7,0x30(%rdi)
923 movdqu 0x70(%rsi),%xmm7
924 pxor 0x10(%rsp),%xmm6
928 movdqu %xmm6,0x40(%rdi)
929 movdqu %xmm11,0x50(%rdi)
930 movdqu %xmm2,0x60(%rdi)
931 movdqu %xmm7,0x70(%rdi)
934 movdqa 0x20(%rsp),%xmm6 # is offloaded, remember?
935 lea 0x80(%rsi),%rsi # inp+=64*2
937 movdqa %xmm6,0x00(%rsp)
938 movdqa %xmm10,0x10(%rsp)
939 lea 0x80(%rdi),%rdi # out+=64*2
940 movdqa %xmm14,0x20(%rsp)
941 sub $128,%rdx # len-=64*2
942 movdqa %xmm8,0x30(%rsp)
947 movdqu 0x00(%rsi),%xmm6 # xor with input
948 movdqu 0x10(%rsi),%xmm11
949 movdqu 0x20(%rsi),%xmm2
950 movdqu 0x30(%rsi),%xmm7
951 pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
956 movdqu %xmm6,0x00(%rdi)
957 movdqu 0x40(%rsi),%xmm6
958 movdqu %xmm11,0x10(%rdi)
959 movdqu 0x50(%rsi),%xmm11
960 movdqu %xmm2,0x20(%rdi)
961 movdqu 0x60(%rsi),%xmm2
962 movdqu %xmm7,0x30(%rdi)
963 movdqu 0x70(%rsi),%xmm7
964 lea 0x80(%rsi),%rsi # size optimization
965 pxor 0x10(%rsp),%xmm6
970 movdqu %xmm6,0x40(%rdi)
971 movdqu 0x00(%rsi),%xmm6
972 movdqu %xmm11,0x50(%rdi)
973 movdqu 0x10(%rsi),%xmm11
974 movdqu %xmm2,0x60(%rdi)
975 movdqu 0x20(%rsi),%xmm2
976 movdqu %xmm7,0x70(%rdi)
977 lea 0x80(%rdi),%rdi # size optimization
978 movdqu 0x30(%rsi),%xmm7
979 pxor 0x20(%rsp),%xmm6
983 movdqu %xmm6,0x00(%rdi)
984 movdqu %xmm11,0x10(%rdi)
985 movdqu %xmm2,0x20(%rdi)
986 movdqu %xmm7,0x30(%rdi)
989 movdqa 0x30(%rsp),%xmm6 # is offloaded, remember?
990 lea 0x40(%rsi),%rsi # inp+=64*3
992 movdqa %xmm6,0x00(%rsp)
993 movdqa %xmm15,0x10(%rsp)
994 lea 0x40(%rdi),%rdi # out+=64*3
995 movdqa %xmm9,0x20(%rsp)
996 sub $192,%rdx # len-=64*3
997 movdqa %xmm3,0x30(%rsp)
1000 movzb (%rsi,%r9),%eax
1001 movzb (%rsp,%r9),%ecx
1004 mov %al,-1(%rdi,%r9)
1012 .size chacha20_4x,.-chacha20_4x
1014 #ifdef CONFIG_AS_AVX2
1016 SYM_FUNC_START(chacha20_avx2)
1019 lea 8(%rsp),%r10 # frame register
1024 ################ stack layout
1025 # +0x00 SIMD equivalent of %r12d
1027 # +0x80 constant copy of key[0-2] smashed by lanes
1029 # +0x200 SIMD counters (with nonce smashed by lanes)
1033 vbroadcasti128 .Lsigma(%rip),%ymm11 # key[0]
1034 vbroadcasti128 (%rcx),%ymm3 # key[1]
1035 vbroadcasti128 16(%rcx),%ymm15 # key[2]
1036 vbroadcasti128 (%r8),%ymm7 # key[3]
1037 lea 0x100(%rsp),%rcx # size optimization
1038 lea 0x200(%rsp),%rax # size optimization
1039 lea .Lrot16(%rip),%r9
1040 lea .Lrot24(%rip),%r11
1042 vpshufd $0x00,%ymm11,%ymm8 # smash key by lanes...
1043 vpshufd $0x55,%ymm11,%ymm9
1044 vmovdqa %ymm8,0x80-0x100(%rcx) # ... and offload
1045 vpshufd $0xaa,%ymm11,%ymm10
1046 vmovdqa %ymm9,0xa0-0x100(%rcx)
1047 vpshufd $0xff,%ymm11,%ymm11
1048 vmovdqa %ymm10,0xc0-0x100(%rcx)
1049 vmovdqa %ymm11,0xe0-0x100(%rcx)
1051 vpshufd $0x00,%ymm3,%ymm0
1052 vpshufd $0x55,%ymm3,%ymm1
1053 vmovdqa %ymm0,0x100-0x100(%rcx)
1054 vpshufd $0xaa,%ymm3,%ymm2
1055 vmovdqa %ymm1,0x120-0x100(%rcx)
1056 vpshufd $0xff,%ymm3,%ymm3
1057 vmovdqa %ymm2,0x140-0x100(%rcx)
1058 vmovdqa %ymm3,0x160-0x100(%rcx)
1060 vpshufd $0x00,%ymm15,%ymm12 # "xc0"
1061 vpshufd $0x55,%ymm15,%ymm13 # "xc1"
1062 vmovdqa %ymm12,0x180-0x200(%rax)
1063 vpshufd $0xaa,%ymm15,%ymm14 # "xc2"
1064 vmovdqa %ymm13,0x1a0-0x200(%rax)
1065 vpshufd $0xff,%ymm15,%ymm15 # "xc3"
1066 vmovdqa %ymm14,0x1c0-0x200(%rax)
1067 vmovdqa %ymm15,0x1e0-0x200(%rax)
1069 vpshufd $0x00,%ymm7,%ymm4
1070 vpshufd $0x55,%ymm7,%ymm5
1071 vpaddd .Lincy(%rip),%ymm4,%ymm4 # don't save counters yet
1072 vpshufd $0xaa,%ymm7,%ymm6
1073 vmovdqa %ymm5,0x220-0x200(%rax)
1074 vpshufd $0xff,%ymm7,%ymm7
1075 vmovdqa %ymm6,0x240-0x200(%rax)
1076 vmovdqa %ymm7,0x260-0x200(%rax)
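# Same smash-by-lanes idea as the 4x path, now eight blocks wide: the
# replicated key words are parked at 0x80(%rsp) (addressed via %rcx)
# and the per-lane counters, offset by .Lincy, at 0x200(%rsp)
# (addressed via %rax), matching the stack-layout comment at the top
# of the function.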
1082 vmovdqa 0x80-0x100(%rcx),%ymm8 # re-load smashed key
1083 vmovdqa 0xa0-0x100(%rcx),%ymm9
1084 vmovdqa 0xc0-0x100(%rcx),%ymm10
1085 vmovdqa 0xe0-0x100(%rcx),%ymm11
1086 vmovdqa 0x100-0x100(%rcx),%ymm0
1087 vmovdqa 0x120-0x100(%rcx),%ymm1
1088 vmovdqa 0x140-0x100(%rcx),%ymm2
1089 vmovdqa 0x160-0x100(%rcx),%ymm3
1090 vmovdqa 0x180-0x200(%rax),%ymm12 # "xc0"
1091 vmovdqa 0x1a0-0x200(%rax),%ymm13 # "xc1"
1092 vmovdqa 0x1c0-0x200(%rax),%ymm14 # "xc2"
1093 vmovdqa 0x1e0-0x200(%rax),%ymm15 # "xc3"
1094 vmovdqa 0x200-0x200(%rax),%ymm4
1095 vmovdqa 0x220-0x200(%rax),%ymm5
1096 vmovdqa 0x240-0x200(%rax),%ymm6
1097 vmovdqa 0x260-0x200(%rax),%ymm7
1098 vpaddd .Leight(%rip),%ymm4,%ymm4 # next SIMD counters
1101 vmovdqa %ymm14,0x40(%rsp) # SIMD equivalent of "%nox"
1102 vmovdqa %ymm15,0x60(%rsp) # SIMD equivalent of "%nox"
1103 vbroadcasti128 (%r9),%ymm15
1104 vmovdqa %ymm4,0x200-0x200(%rax) # save SIMD counters
1110 vpaddd %ymm0,%ymm8,%ymm8
1111 vpxor %ymm4,%ymm8,%ymm4
1112 vpshufb %ymm15,%ymm4,%ymm4
1113 vpaddd %ymm1,%ymm9,%ymm9
1114 vpxor %ymm5,%ymm9,%ymm5
1115 vpshufb %ymm15,%ymm5,%ymm5
1116 vpaddd %ymm4,%ymm12,%ymm12
1117 vpxor %ymm0,%ymm12,%ymm0
1118 vpslld $12,%ymm0,%ymm14
1119 vpsrld $20,%ymm0,%ymm0
1120 vpor %ymm0,%ymm14,%ymm0
1121 vbroadcasti128 (%r11),%ymm14
1122 vpaddd %ymm5,%ymm13,%ymm13
1123 vpxor %ymm1,%ymm13,%ymm1
1124 vpslld $12,%ymm1,%ymm15
1125 vpsrld $20,%ymm1,%ymm1
1126 vpor %ymm1,%ymm15,%ymm1
1127 vpaddd %ymm0,%ymm8,%ymm8
1128 vpxor %ymm4,%ymm8,%ymm4
1129 vpshufb %ymm14,%ymm4,%ymm4
1130 vpaddd %ymm1,%ymm9,%ymm9
1131 vpxor %ymm5,%ymm9,%ymm5
1132 vpshufb %ymm14,%ymm5,%ymm5
1133 vpaddd %ymm4,%ymm12,%ymm12
1134 vpxor %ymm0,%ymm12,%ymm0
1135 vpslld $7,%ymm0,%ymm15
1136 vpsrld $25,%ymm0,%ymm0
1137 vpor %ymm0,%ymm15,%ymm0
1138 vbroadcasti128 (%r9),%ymm15
1139 vpaddd %ymm5,%ymm13,%ymm13
1140 vpxor %ymm1,%ymm13,%ymm1
1141 vpslld $7,%ymm1,%ymm14
1142 vpsrld $25,%ymm1,%ymm1
1143 vpor %ymm1,%ymm14,%ymm1
1144 vmovdqa %ymm12,0(%rsp)
1145 vmovdqa %ymm13,32(%rsp)
1146 vmovdqa 64(%rsp),%ymm12
1147 vmovdqa 96(%rsp),%ymm13
1148 vpaddd %ymm2,%ymm10,%ymm10
1149 vpxor %ymm6,%ymm10,%ymm6
1150 vpshufb %ymm15,%ymm6,%ymm6
1151 vpaddd %ymm3,%ymm11,%ymm11
1152 vpxor %ymm7,%ymm11,%ymm7
1153 vpshufb %ymm15,%ymm7,%ymm7
1154 vpaddd %ymm6,%ymm12,%ymm12
1155 vpxor %ymm2,%ymm12,%ymm2
1156 vpslld $12,%ymm2,%ymm14
1157 vpsrld $20,%ymm2,%ymm2
1158 vpor %ymm2,%ymm14,%ymm2
1159 vbroadcasti128 (%r11),%ymm14
1160 vpaddd %ymm7,%ymm13,%ymm13
1161 vpxor %ymm3,%ymm13,%ymm3
1162 vpslld $12,%ymm3,%ymm15
1163 vpsrld $20,%ymm3,%ymm3
1164 vpor %ymm3,%ymm15,%ymm3
1165 vpaddd %ymm2,%ymm10,%ymm10
1166 vpxor %ymm6,%ymm10,%ymm6
1167 vpshufb %ymm14,%ymm6,%ymm6
1168 vpaddd %ymm3,%ymm11,%ymm11
1169 vpxor %ymm7,%ymm11,%ymm7
1170 vpshufb %ymm14,%ymm7,%ymm7
1171 vpaddd %ymm6,%ymm12,%ymm12
1172 vpxor %ymm2,%ymm12,%ymm2
1173 vpslld $7,%ymm2,%ymm15
1174 vpsrld $25,%ymm2,%ymm2
1175 vpor %ymm2,%ymm15,%ymm2
1176 vbroadcasti128 (%r9),%ymm15
1177 vpaddd %ymm7,%ymm13,%ymm13
1178 vpxor %ymm3,%ymm13,%ymm3
1179 vpslld $7,%ymm3,%ymm14
1180 vpsrld $25,%ymm3,%ymm3
1181 vpor %ymm3,%ymm14,%ymm3
1182 vpaddd %ymm1,%ymm8,%ymm8
1183 vpxor %ymm7,%ymm8,%ymm7
1184 vpshufb %ymm15,%ymm7,%ymm7
1185 vpaddd %ymm2,%ymm9,%ymm9
1186 vpxor %ymm4,%ymm9,%ymm4
1187 vpshufb %ymm15,%ymm4,%ymm4
1188 vpaddd %ymm7,%ymm12,%ymm12
1189 vpxor %ymm1,%ymm12,%ymm1
1190 vpslld $12,%ymm1,%ymm14
1191 vpsrld $20,%ymm1,%ymm1
1192 vpor %ymm1,%ymm14,%ymm1
1193 vbroadcasti128 (%r11),%ymm14
1194 vpaddd %ymm4,%ymm13,%ymm13
1195 vpxor %ymm2,%ymm13,%ymm2
1196 vpslld $12,%ymm2,%ymm15
1197 vpsrld $20,%ymm2,%ymm2
1198 vpor %ymm2,%ymm15,%ymm2
1199 vpaddd %ymm1,%ymm8,%ymm8
1200 vpxor %ymm7,%ymm8,%ymm7
1201 vpshufb %ymm14,%ymm7,%ymm7
1202 vpaddd %ymm2,%ymm9,%ymm9
1203 vpxor %ymm4,%ymm9,%ymm4
1204 vpshufb %ymm14,%ymm4,%ymm4
1205 vpaddd %ymm7,%ymm12,%ymm12
1206 vpxor %ymm1,%ymm12,%ymm1
1207 vpslld $7,%ymm1,%ymm15
1208 vpsrld $25,%ymm1,%ymm1
1209 vpor %ymm1,%ymm15,%ymm1
1210 vbroadcasti128 (%r9),%ymm15
1211 vpaddd %ymm4,%ymm13,%ymm13
1212 vpxor %ymm2,%ymm13,%ymm2
1213 vpslld $7,%ymm2,%ymm14
1214 vpsrld $25,%ymm2,%ymm2
1215 vpor %ymm2,%ymm14,%ymm2
1216 vmovdqa %ymm12,64(%rsp)
1217 vmovdqa %ymm13,96(%rsp)
1218 vmovdqa 0(%rsp),%ymm12
1219 vmovdqa 32(%rsp),%ymm13
1220 vpaddd %ymm3,%ymm10,%ymm10
1221 vpxor %ymm5,%ymm10,%ymm5
1222 vpshufb %ymm15,%ymm5,%ymm5
1223 vpaddd %ymm0,%ymm11,%ymm11
1224 vpxor %ymm6,%ymm11,%ymm6
1225 vpshufb %ymm15,%ymm6,%ymm6
1226 vpaddd %ymm5,%ymm12,%ymm12
1227 vpxor %ymm3,%ymm12,%ymm3
1228 vpslld $12,%ymm3,%ymm14
1229 vpsrld $20,%ymm3,%ymm3
1230 vpor %ymm3,%ymm14,%ymm3
1231 vbroadcasti128 (%r11),%ymm14
1232 vpaddd %ymm6,%ymm13,%ymm13
1233 vpxor %ymm0,%ymm13,%ymm0
1234 vpslld $12,%ymm0,%ymm15
1235 vpsrld $20,%ymm0,%ymm0
1236 vpor %ymm0,%ymm15,%ymm0
1237 vpaddd %ymm3,%ymm10,%ymm10
1238 vpxor %ymm5,%ymm10,%ymm5
1239 vpshufb %ymm14,%ymm5,%ymm5
1240 vpaddd %ymm0,%ymm11,%ymm11
1241 vpxor %ymm6,%ymm11,%ymm6
1242 vpshufb %ymm14,%ymm6,%ymm6
1243 vpaddd %ymm5,%ymm12,%ymm12
1244 vpxor %ymm3,%ymm12,%ymm3
1245 vpslld $7,%ymm3,%ymm15
1246 vpsrld $25,%ymm3,%ymm3
1247 vpor %ymm3,%ymm15,%ymm3
1248 vbroadcasti128 (%r9),%ymm15
1249 vpaddd %ymm6,%ymm13,%ymm13
1250 vpxor %ymm0,%ymm13,%ymm0
1251 vpslld $7,%ymm0,%ymm14
1252 vpsrld $25,%ymm0,%ymm0
1253 vpor %ymm0,%ymm14,%ymm0
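# Two rotation idioms are interleaved through the round above: the 16-
# and 8-bit rotates are byte permutations done with one vpshufb each
# (masks broadcast from .Lrot16/.Lrot24 via %r9/%r11), while the 12-
# and 7-bit rotates need the generic vpslld/vpsrld/vpor triple;
# %ymm14/%ymm15 double as scratch, hence the repeated mask reloads.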
1257 lea 0x200(%rsp),%rax # size optimization
1258 vpaddd 0x80-0x100(%rcx),%ymm8,%ymm8 # accumulate key
1259 vpaddd 0xa0-0x100(%rcx),%ymm9,%ymm9
1260 vpaddd 0xc0-0x100(%rcx),%ymm10,%ymm10
1261 vpaddd 0xe0-0x100(%rcx),%ymm11,%ymm11
1263 vpunpckldq %ymm9,%ymm8,%ymm14 # "de-interlace" data
1264 vpunpckldq %ymm11,%ymm10,%ymm15
1265 vpunpckhdq %ymm9,%ymm8,%ymm8
1266 vpunpckhdq %ymm11,%ymm10,%ymm10
1267 vpunpcklqdq %ymm15,%ymm14,%ymm9 # "a0"
1268 vpunpckhqdq %ymm15,%ymm14,%ymm14 # "a1"
1269 vpunpcklqdq %ymm10,%ymm8,%ymm11 # "a2"
1270 vpunpckhqdq %ymm10,%ymm8,%ymm8 # "a3"
1271 vpaddd 0x100-0x100(%rcx),%ymm0,%ymm0
1272 vpaddd 0x120-0x100(%rcx),%ymm1,%ymm1
1273 vpaddd 0x140-0x100(%rcx),%ymm2,%ymm2
1274 vpaddd 0x160-0x100(%rcx),%ymm3,%ymm3
1276 vpunpckldq %ymm1,%ymm0,%ymm10
1277 vpunpckldq %ymm3,%ymm2,%ymm15
1278 vpunpckhdq %ymm1,%ymm0,%ymm0
1279 vpunpckhdq %ymm3,%ymm2,%ymm2
1280 vpunpcklqdq %ymm15,%ymm10,%ymm1 # "b0"
1281 vpunpckhqdq %ymm15,%ymm10,%ymm10 # "b1"
1282 vpunpcklqdq %ymm2,%ymm0,%ymm3 # "b2"
1283 vpunpckhqdq %ymm2,%ymm0,%ymm0 # "b3"
1284 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 # "de-interlace" further
1285 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
1286 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
1287 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
1288 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
1289 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
1290 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
1291 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
1292 vmovdqa %ymm15,0x00(%rsp) # offload
1293 vmovdqa %ymm9,0x20(%rsp)
1294 vmovdqa 0x40(%rsp),%ymm15 # %ymm15
1295 vmovdqa 0x60(%rsp),%ymm9 # %ymm9
1297 vpaddd 0x180-0x200(%rax),%ymm12,%ymm12
1298 vpaddd 0x1a0-0x200(%rax),%ymm13,%ymm13
1299 vpaddd 0x1c0-0x200(%rax),%ymm15,%ymm15
1300 vpaddd 0x1e0-0x200(%rax),%ymm9,%ymm9
1302 vpunpckldq %ymm13,%ymm12,%ymm2
1303 vpunpckldq %ymm9,%ymm15,%ymm8
1304 vpunpckhdq %ymm13,%ymm12,%ymm12
1305 vpunpckhdq %ymm9,%ymm15,%ymm15
1306 vpunpcklqdq %ymm8,%ymm2,%ymm13 # "c0"
1307 vpunpckhqdq %ymm8,%ymm2,%ymm2 # "c1"
1308 vpunpcklqdq %ymm15,%ymm12,%ymm9 # "c2"
1309 vpunpckhqdq %ymm15,%ymm12,%ymm12 # "c3"
1310 vpaddd 0x200-0x200(%rax),%ymm4,%ymm4
1311 vpaddd 0x220-0x200(%rax),%ymm5,%ymm5
1312 vpaddd 0x240-0x200(%rax),%ymm6,%ymm6
1313 vpaddd 0x260-0x200(%rax),%ymm7,%ymm7
1315 vpunpckldq %ymm5,%ymm4,%ymm15
1316 vpunpckldq %ymm7,%ymm6,%ymm8
1317 vpunpckhdq %ymm5,%ymm4,%ymm4
1318 vpunpckhdq %ymm7,%ymm6,%ymm6
1319 vpunpcklqdq %ymm8,%ymm15,%ymm5 # "d0"
1320 vpunpckhqdq %ymm8,%ymm15,%ymm15 # "d1"
1321 vpunpcklqdq %ymm6,%ymm4,%ymm7 # "d2"
1322 vpunpckhqdq %ymm6,%ymm4,%ymm4 # "d3"
1323 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 # "de-interlace" further
1324 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
1325 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
1326 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
1327 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
1328 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
1329 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
1330 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
1331 vmovdqa 0x00(%rsp),%ymm6 # was offloaded, remember?
1332 vmovdqa 0x20(%rsp),%ymm12
1337 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1338 vpxor 0x20(%rsi),%ymm8,%ymm8
1339 vpxor 0x40(%rsi),%ymm1,%ymm1
1340 vpxor 0x60(%rsi),%ymm5,%ymm5
1341 lea 0x80(%rsi),%rsi # size optimization
1342 vmovdqu %ymm6,0x00(%rdi)
1343 vmovdqu %ymm8,0x20(%rdi)
1344 vmovdqu %ymm1,0x40(%rdi)
1345 vmovdqu %ymm5,0x60(%rdi)
1346 lea 0x80(%rdi),%rdi # size optimization
1348 vpxor 0x00(%rsi),%ymm12,%ymm12
1349 vpxor 0x20(%rsi),%ymm13,%ymm13
1350 vpxor 0x40(%rsi),%ymm10,%ymm10
1351 vpxor 0x60(%rsi),%ymm15,%ymm15
1352 lea 0x80(%rsi),%rsi # size optimization
1353 vmovdqu %ymm12,0x00(%rdi)
1354 vmovdqu %ymm13,0x20(%rdi)
1355 vmovdqu %ymm10,0x40(%rdi)
1356 vmovdqu %ymm15,0x60(%rdi)
1357 lea 0x80(%rdi),%rdi # size optimization
1359 vpxor 0x00(%rsi),%ymm14,%ymm14
1360 vpxor 0x20(%rsi),%ymm2,%ymm2
1361 vpxor 0x40(%rsi),%ymm3,%ymm3
1362 vpxor 0x60(%rsi),%ymm7,%ymm7
1363 lea 0x80(%rsi),%rsi # size optimization
1364 vmovdqu %ymm14,0x00(%rdi)
1365 vmovdqu %ymm2,0x20(%rdi)
1366 vmovdqu %ymm3,0x40(%rdi)
1367 vmovdqu %ymm7,0x60(%rdi)
1368 lea 0x80(%rdi),%rdi # size optimization
1370 vpxor 0x00(%rsi),%ymm11,%ymm11
1371 vpxor 0x20(%rsi),%ymm9,%ymm9
1372 vpxor 0x40(%rsi),%ymm0,%ymm0
1373 vpxor 0x60(%rsi),%ymm4,%ymm4
1374 lea 0x80(%rsi),%rsi # size optimization
1375 vmovdqu %ymm11,0x00(%rdi)
1376 vmovdqu %ymm9,0x20(%rdi)
1377 vmovdqu %ymm0,0x40(%rdi)
1378 vmovdqu %ymm4,0x60(%rdi)
1379 lea 0x80(%rdi),%rdi # size optimization
1403 vmovdqa %ymm6,0x00(%rsp)
1404 vmovdqa %ymm8,0x20(%rsp)
1409 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1410 vpxor 0x20(%rsi),%ymm8,%ymm8
1411 vmovdqu %ymm6,0x00(%rdi)
1412 vmovdqu %ymm8,0x20(%rdi)
1415 lea 0x40(%rsi),%rsi # inp+=64*1
1417 vmovdqa %ymm1,0x00(%rsp)
1418 lea 0x40(%rdi),%rdi # out+=64*1
1419 sub $64,%rdx # len-=64*1
1420 vmovdqa %ymm5,0x20(%rsp)
1425 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1426 vpxor 0x20(%rsi),%ymm8,%ymm8
1427 vpxor 0x40(%rsi),%ymm1,%ymm1
1428 vpxor 0x60(%rsi),%ymm5,%ymm5
1429 vmovdqu %ymm6,0x00(%rdi)
1430 vmovdqu %ymm8,0x20(%rdi)
1431 vmovdqu %ymm1,0x40(%rdi)
1432 vmovdqu %ymm5,0x60(%rdi)
1435 lea 0x80(%rsi),%rsi # inp+=64*2
1437 vmovdqa %ymm12,0x00(%rsp)
1438 lea 0x80(%rdi),%rdi # out+=64*2
1439 sub $128,%rdx # len-=64*2
1440 vmovdqa %ymm13,0x20(%rsp)
1445 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1446 vpxor 0x20(%rsi),%ymm8,%ymm8
1447 vpxor 0x40(%rsi),%ymm1,%ymm1
1448 vpxor 0x60(%rsi),%ymm5,%ymm5
1449 vpxor 0x80(%rsi),%ymm12,%ymm12
1450 vpxor 0xa0(%rsi),%ymm13,%ymm13
1451 vmovdqu %ymm6,0x00(%rdi)
1452 vmovdqu %ymm8,0x20(%rdi)
1453 vmovdqu %ymm1,0x40(%rdi)
1454 vmovdqu %ymm5,0x60(%rdi)
1455 vmovdqu %ymm12,0x80(%rdi)
1456 vmovdqu %ymm13,0xa0(%rdi)
1459 lea 0xc0(%rsi),%rsi # inp+=64*3
1461 vmovdqa %ymm10,0x00(%rsp)
1462 lea 0xc0(%rdi),%rdi # out+=64*3
1463 sub $192,%rdx # len-=64*3
1464 vmovdqa %ymm15,0x20(%rsp)
1469 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1470 vpxor 0x20(%rsi),%ymm8,%ymm8
1471 vpxor 0x40(%rsi),%ymm1,%ymm1
1472 vpxor 0x60(%rsi),%ymm5,%ymm5
1473 vpxor 0x80(%rsi),%ymm12,%ymm12
1474 vpxor 0xa0(%rsi),%ymm13,%ymm13
1475 vpxor 0xc0(%rsi),%ymm10,%ymm10
1476 vpxor 0xe0(%rsi),%ymm15,%ymm15
1477 vmovdqu %ymm6,0x00(%rdi)
1478 vmovdqu %ymm8,0x20(%rdi)
1479 vmovdqu %ymm1,0x40(%rdi)
1480 vmovdqu %ymm5,0x60(%rdi)
1481 vmovdqu %ymm12,0x80(%rdi)
1482 vmovdqu %ymm13,0xa0(%rdi)
1483 vmovdqu %ymm10,0xc0(%rdi)
1484 vmovdqu %ymm15,0xe0(%rdi)
1487 lea 0x100(%rsi),%rsi # inp+=64*4
1489 vmovdqa %ymm14,0x00(%rsp)
1490 lea 0x100(%rdi),%rdi # out+=64*4
1491 sub $256,%rdx # len-=64*4
1492 vmovdqa %ymm2,0x20(%rsp)
1497 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1498 vpxor 0x20(%rsi),%ymm8,%ymm8
1499 vpxor 0x40(%rsi),%ymm1,%ymm1
1500 vpxor 0x60(%rsi),%ymm5,%ymm5
1501 vpxor 0x80(%rsi),%ymm12,%ymm12
1502 vpxor 0xa0(%rsi),%ymm13,%ymm13
1503 vpxor 0xc0(%rsi),%ymm10,%ymm10
1504 vpxor 0xe0(%rsi),%ymm15,%ymm15
1505 vpxor 0x100(%rsi),%ymm14,%ymm14
1506 vpxor 0x120(%rsi),%ymm2,%ymm2
1507 vmovdqu %ymm6,0x00(%rdi)
1508 vmovdqu %ymm8,0x20(%rdi)
1509 vmovdqu %ymm1,0x40(%rdi)
1510 vmovdqu %ymm5,0x60(%rdi)
1511 vmovdqu %ymm12,0x80(%rdi)
1512 vmovdqu %ymm13,0xa0(%rdi)
1513 vmovdqu %ymm10,0xc0(%rdi)
1514 vmovdqu %ymm15,0xe0(%rdi)
1515 vmovdqu %ymm14,0x100(%rdi)
1516 vmovdqu %ymm2,0x120(%rdi)
1519 lea 0x140(%rsi),%rsi # inp+=64*5
1521 vmovdqa %ymm3,0x00(%rsp)
1522 lea 0x140(%rdi),%rdi # out+=64*5
1523 sub $320,%rdx # len-=64*5
1524 vmovdqa %ymm7,0x20(%rsp)
1529 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1530 vpxor 0x20(%rsi),%ymm8,%ymm8
1531 vpxor 0x40(%rsi),%ymm1,%ymm1
1532 vpxor 0x60(%rsi),%ymm5,%ymm5
1533 vpxor 0x80(%rsi),%ymm12,%ymm12
1534 vpxor 0xa0(%rsi),%ymm13,%ymm13
1535 vpxor 0xc0(%rsi),%ymm10,%ymm10
1536 vpxor 0xe0(%rsi),%ymm15,%ymm15
1537 vpxor 0x100(%rsi),%ymm14,%ymm14
1538 vpxor 0x120(%rsi),%ymm2,%ymm2
1539 vpxor 0x140(%rsi),%ymm3,%ymm3
1540 vpxor 0x160(%rsi),%ymm7,%ymm7
1541 vmovdqu %ymm6,0x00(%rdi)
1542 vmovdqu %ymm8,0x20(%rdi)
1543 vmovdqu %ymm1,0x40(%rdi)
1544 vmovdqu %ymm5,0x60(%rdi)
1545 vmovdqu %ymm12,0x80(%rdi)
1546 vmovdqu %ymm13,0xa0(%rdi)
1547 vmovdqu %ymm10,0xc0(%rdi)
1548 vmovdqu %ymm15,0xe0(%rdi)
1549 vmovdqu %ymm14,0x100(%rdi)
1550 vmovdqu %ymm2,0x120(%rdi)
1551 vmovdqu %ymm3,0x140(%rdi)
1552 vmovdqu %ymm7,0x160(%rdi)
1555 lea 0x180(%rsi),%rsi # inp+=64*6
1557 vmovdqa %ymm11,0x00(%rsp)
1558 lea 0x180(%rdi),%rdi # out+=64*6
1559 sub $384,%rdx # len-=64*6
1560 vmovdqa %ymm9,0x20(%rsp)
1565 vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
1566 vpxor 0x20(%rsi),%ymm8,%ymm8
1567 vpxor 0x40(%rsi),%ymm1,%ymm1
1568 vpxor 0x60(%rsi),%ymm5,%ymm5
1569 vpxor 0x80(%rsi),%ymm12,%ymm12
1570 vpxor 0xa0(%rsi),%ymm13,%ymm13
1571 vpxor 0xc0(%rsi),%ymm10,%ymm10
1572 vpxor 0xe0(%rsi),%ymm15,%ymm15
1573 vpxor 0x100(%rsi),%ymm14,%ymm14
1574 vpxor 0x120(%rsi),%ymm2,%ymm2
1575 vpxor 0x140(%rsi),%ymm3,%ymm3
1576 vpxor 0x160(%rsi),%ymm7,%ymm7
1577 vpxor 0x180(%rsi),%ymm11,%ymm11
1578 vpxor 0x1a0(%rsi),%ymm9,%ymm9
1579 vmovdqu %ymm6,0x00(%rdi)
1580 vmovdqu %ymm8,0x20(%rdi)
1581 vmovdqu %ymm1,0x40(%rdi)
1582 vmovdqu %ymm5,0x60(%rdi)
1583 vmovdqu %ymm12,0x80(%rdi)
1584 vmovdqu %ymm13,0xa0(%rdi)
1585 vmovdqu %ymm10,0xc0(%rdi)
1586 vmovdqu %ymm15,0xe0(%rdi)
1587 vmovdqu %ymm14,0x100(%rdi)
1588 vmovdqu %ymm2,0x120(%rdi)
1589 vmovdqu %ymm3,0x140(%rdi)
1590 vmovdqu %ymm7,0x160(%rdi)
1591 vmovdqu %ymm11,0x180(%rdi)
1592 vmovdqu %ymm9,0x1a0(%rdi)
1595 lea 0x1c0(%rsi),%rsi # inp+=64*7
1597 vmovdqa %ymm0,0x00(%rsp)
1598 lea 0x1c0(%rdi),%rdi # out+=64*7
1599 sub $448,%rdx # len-=64*7
1600 vmovdqa %ymm4,0x20(%rsp)
1603 movzb (%rsi,%r9),%eax
1604 movzb (%rsp,%r9),%ecx
1607 mov %al,-1(%rdi,%r9)
1616 SYM_FUNC_END(chacha20_avx2)
1618 #ifdef CONFIG_AS_AVX512
1620 SYM_FUNC_START(chacha20_avx512)
1622 lea 8(%rsp),%r10 # frame pointer
1628 vbroadcasti32x4 .Lsigma(%rip),%zmm0
1629 vbroadcasti32x4 (%rcx),%zmm1
1630 vbroadcasti32x4 16(%rcx),%zmm2
1631 vbroadcasti32x4 (%r8),%zmm3
1633 vmovdqa32 %zmm0,%zmm16
1634 vmovdqa32 %zmm1,%zmm17
1635 vmovdqa32 %zmm2,%zmm18
1636 vpaddd .Lzeroz(%rip),%zmm3,%zmm3
1637 vmovdqa32 .Lfourz(%rip),%zmm20
1638 mov $10,%r8 # reuse %r8
1639 vmovdqa32 %zmm3,%zmm19
1644 vmovdqa32 %zmm16,%zmm0
1645 vmovdqa32 %zmm17,%zmm1
1646 vmovdqa32 %zmm18,%zmm2
1647 vpaddd %zmm20,%zmm19,%zmm3
1649 vmovdqa32 %zmm3,%zmm19
1654 vpaddd %zmm1,%zmm0,%zmm0
1655 vpxord %zmm0,%zmm3,%zmm3
1656 vprold $16,%zmm3,%zmm3
1657 vpaddd %zmm3,%zmm2,%zmm2
1658 vpxord %zmm2,%zmm1,%zmm1
1659 vprold $12,%zmm1,%zmm1
1660 vpaddd %zmm1,%zmm0,%zmm0
1661 vpxord %zmm0,%zmm3,%zmm3
1662 vprold $8,%zmm3,%zmm3
1663 vpaddd %zmm3,%zmm2,%zmm2
1664 vpxord %zmm2,%zmm1,%zmm1
1665 vprold $7,%zmm1,%zmm1
1666 vpshufd $78,%zmm2,%zmm2
1667 vpshufd $57,%zmm1,%zmm1
1668 vpshufd $147,%zmm3,%zmm3
1669 vpaddd %zmm1,%zmm0,%zmm0
1670 vpxord %zmm0,%zmm3,%zmm3
1671 vprold $16,%zmm3,%zmm3
1672 vpaddd %zmm3,%zmm2,%zmm2
1673 vpxord %zmm2,%zmm1,%zmm1
1674 vprold $12,%zmm1,%zmm1
1675 vpaddd %zmm1,%zmm0,%zmm0
1676 vpxord %zmm0,%zmm3,%zmm3
1677 vprold $8,%zmm3,%zmm3
1678 vpaddd %zmm3,%zmm2,%zmm2
1679 vpxord %zmm2,%zmm1,%zmm1
1680 vprold $7,%zmm1,%zmm1
1681 vpshufd $78,%zmm2,%zmm2
1682 vpshufd $147,%zmm1,%zmm1
1683 vpshufd $57,%zmm3,%zmm3
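# AVX512F collapses both rotation idioms: vprold rotates every dword
# in one instruction, so neither the pshufb masks nor the
# shift/shift/or triples are needed, and the extended register file
# (%zmm16 and up) keeps the saved state out of memory.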
1686 vpaddd %zmm16,%zmm0,%zmm0
1687 vpaddd %zmm17,%zmm1,%zmm1
1688 vpaddd %zmm18,%zmm2,%zmm2
1689 vpaddd %zmm19,%zmm3,%zmm3
1694 vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
1695 vpxor 0x10(%rsi),%xmm1,%xmm5
1696 vpxor 0x20(%rsi),%xmm2,%xmm6
1697 vpxor 0x30(%rsi),%xmm3,%xmm7
1698 lea 0x40(%rsi),%rsi # inp+=64
1700 vmovdqu %xmm4,0x00(%rdi) # write output
1701 vmovdqu %xmm5,0x10(%rdi)
1702 vmovdqu %xmm6,0x20(%rdi)
1703 vmovdqu %xmm7,0x30(%rdi)
1704 lea 0x40(%rdi),%rdi # out+=64
1708 vextracti32x4 $1,%zmm0,%xmm4
1709 vextracti32x4 $1,%zmm1,%xmm5
1710 vextracti32x4 $1,%zmm2,%xmm6
1711 vextracti32x4 $1,%zmm3,%xmm7
1716 vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
1717 vpxor 0x10(%rsi),%xmm5,%xmm5
1718 vpxor 0x20(%rsi),%xmm6,%xmm6
1719 vpxor 0x30(%rsi),%xmm7,%xmm7
1720 lea 0x40(%rsi),%rsi # inp+=64
1722 vmovdqu %xmm4,0x00(%rdi) # write output
1723 vmovdqu %xmm5,0x10(%rdi)
1724 vmovdqu %xmm6,0x20(%rdi)
1725 vmovdqu %xmm7,0x30(%rdi)
1726 lea 0x40(%rdi),%rdi # out+=64
1730 vextracti32x4 $2,%zmm0,%xmm4
1731 vextracti32x4 $2,%zmm1,%xmm5
1732 vextracti32x4 $2,%zmm2,%xmm6
1733 vextracti32x4 $2,%zmm3,%xmm7
1738 vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
1739 vpxor 0x10(%rsi),%xmm5,%xmm5
1740 vpxor 0x20(%rsi),%xmm6,%xmm6
1741 vpxor 0x30(%rsi),%xmm7,%xmm7
1742 lea 0x40(%rsi),%rsi # inp+=64
1744 vmovdqu %xmm4,0x00(%rdi) # write output
1745 vmovdqu %xmm5,0x10(%rdi)
1746 vmovdqu %xmm6,0x20(%rdi)
1747 vmovdqu %xmm7,0x30(%rdi)
1748 lea 0x40(%rdi),%rdi # out+=64
1752 vextracti32x4 $3,%zmm0,%xmm4
1753 vextracti32x4 $3,%zmm1,%xmm5
1754 vextracti32x4 $3,%zmm2,%xmm6
1755 vextracti32x4 $3,%zmm3,%xmm7
1760 vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
1761 vpxor 0x10(%rsi),%xmm5,%xmm5
1762 vpxor 0x20(%rsi),%xmm6,%xmm6
1763 vpxor 0x30(%rsi),%xmm7,%xmm7
1764 lea 0x40(%rsi),%rsi # inp+=64
1766 vmovdqu %xmm4,0x00(%rdi) # write output
1767 vmovdqu %xmm5,0x10(%rdi)
1768 vmovdqu %xmm6,0x20(%rdi)
1769 vmovdqu %xmm7,0x30(%rdi)
1770 lea 0x40(%rdi),%rdi # out+=64
1772 jnz .Loop_outer_avx512
1778 vmovdqa %xmm0,0x00(%rsp)
1779 vmovdqa %xmm1,0x10(%rsp)
1780 vmovdqa %xmm2,0x20(%rsp)
1781 vmovdqa %xmm3,0x30(%rsp)
1783 jmp .Loop_tail_avx512
1787 vmovdqa %xmm4,0x00(%rsp)
1788 vmovdqa %xmm5,0x10(%rsp)
1789 vmovdqa %xmm6,0x20(%rsp)
1790 vmovdqa %xmm7,0x30(%rsp)
1794 movzb (%rsi,%r8),%eax
1795 movzb (%rsp,%r8),%ecx
1798 mov %al,-1(%rdi,%r8)
1800 jnz .Loop_tail_avx512
1802 vmovdqu32 %zmm16,0x00(%rsp)
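# Tail handling: a partial final block's keystream is spilled to the
# stack and XORed in byte-by-byte by the movzb loop above; the %zmm16
# store just above (which holds the non-secret sigma broadcast) then
# overwrites those stack slots so no keystream bytes are left behind.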
1809 SYM_FUNC_END(chacha20_avx512)
1811 SYM_FUNC_START(chacha20_avx512vl)
1812 .Lchacha20_avx512vl:
1813 lea 8(%rsp),%r10 # frame pointer
1819 vbroadcasti128 .Lsigma(%rip),%ymm0
1820 vbroadcasti128 (%rcx),%ymm1
1821 vbroadcasti128 16(%rcx),%ymm2
1822 vbroadcasti128 (%r8),%ymm3
1824 vmovdqa32 %ymm0,%ymm16
1825 vmovdqa32 %ymm1,%ymm17
1826 vmovdqa32 %ymm2,%ymm18
1827 vpaddd .Lzeroz(%rip),%ymm3,%ymm3
1828 vmovdqa32 .Ltwoy(%rip),%ymm20
1829 mov $10,%r8 # reuse %r8
1830 vmovdqa32 %ymm3,%ymm19
1834 .Loop_outer_avx512vl:
1835 vmovdqa32 %ymm18,%ymm2
1836 vpaddd %ymm20,%ymm19,%ymm3
1838 vmovdqa32 %ymm3,%ymm19
1843 vpaddd %ymm1,%ymm0,%ymm0
1844 vpxor %ymm0,%ymm3,%ymm3
1845 vprold $16,%ymm3,%ymm3
1846 vpaddd %ymm3,%ymm2,%ymm2
1847 vpxor %ymm2,%ymm1,%ymm1
1848 vprold $12,%ymm1,%ymm1
1849 vpaddd %ymm1,%ymm0,%ymm0
1850 vpxor %ymm0,%ymm3,%ymm3
1851 vprold $8,%ymm3,%ymm3
1852 vpaddd %ymm3,%ymm2,%ymm2
1853 vpxor %ymm2,%ymm1,%ymm1
1854 vprold $7,%ymm1,%ymm1
1855 vpshufd $78,%ymm2,%ymm2
1856 vpshufd $57,%ymm1,%ymm1
1857 vpshufd $147,%ymm3,%ymm3
1858 vpaddd %ymm1,%ymm0,%ymm0
1859 vpxor %ymm0,%ymm3,%ymm3
1860 vprold $16,%ymm3,%ymm3
1861 vpaddd %ymm3,%ymm2,%ymm2
1862 vpxor %ymm2,%ymm1,%ymm1
1863 vprold $12,%ymm1,%ymm1
1864 vpaddd %ymm1,%ymm0,%ymm0
1865 vpxor %ymm0,%ymm3,%ymm3
1866 vprold $8,%ymm3,%ymm3
1867 vpaddd %ymm3,%ymm2,%ymm2
1868 vpxor %ymm2,%ymm1,%ymm1
1869 vprold $7,%ymm1,%ymm1
1870 vpshufd $78,%ymm2,%ymm2
1871 vpshufd $147,%ymm1,%ymm1
1872 vpshufd $57,%ymm3,%ymm3
1875 vpaddd %ymm16,%ymm0,%ymm0
1876 vpaddd %ymm17,%ymm1,%ymm1
1877 vpaddd %ymm18,%ymm2,%ymm2
1878 vpaddd %ymm19,%ymm3,%ymm3
1881 jb .Ltail64_avx512vl
1883 vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
1884 vpxor 0x10(%rsi),%xmm1,%xmm5
1885 vpxor 0x20(%rsi),%xmm2,%xmm6
1886 vpxor 0x30(%rsi),%xmm3,%xmm7
1887 lea 0x40(%rsi),%rsi # inp+=64
1889 vmovdqu %xmm4,0x00(%rdi) # write output
1890 vmovdqu %xmm5,0x10(%rdi)
1891 vmovdqu %xmm6,0x20(%rdi)
1892 vmovdqu %xmm7,0x30(%rdi)
1893 lea 0x40(%rdi),%rdi # out+=64
1897 vextracti128 $1,%ymm0,%xmm4
1898 vextracti128 $1,%ymm1,%xmm5
1899 vextracti128 $1,%ymm2,%xmm6
1900 vextracti128 $1,%ymm3,%xmm7
1905 vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
1906 vpxor 0x10(%rsi),%xmm5,%xmm5
1907 vpxor 0x20(%rsi),%xmm6,%xmm6
1908 vpxor 0x30(%rsi),%xmm7,%xmm7
1909 lea 0x40(%rsi),%rsi # inp+=64
1911 vmovdqu %xmm4,0x00(%rdi) # write output
1912 vmovdqu %xmm5,0x10(%rdi)
1913 vmovdqu %xmm6,0x20(%rdi)
1914 vmovdqu %xmm7,0x30(%rdi)
1915 lea 0x40(%rdi),%rdi # out+=64
1917 vmovdqa32 %ymm16,%ymm0
1918 vmovdqa32 %ymm17,%ymm1
1919 jnz .Loop_outer_avx512vl
1925 vmovdqa %xmm0,0x00(%rsp)
1926 vmovdqa %xmm1,0x10(%rsp)
1927 vmovdqa %xmm2,0x20(%rsp)
1928 vmovdqa %xmm3,0x30(%rsp)
1930 jmp .Loop_tail_avx512vl
1934 vmovdqa %xmm4,0x00(%rsp)
1935 vmovdqa %xmm5,0x10(%rsp)
1936 vmovdqa %xmm6,0x20(%rsp)
1937 vmovdqa %xmm7,0x30(%rsp)
1940 .Loop_tail_avx512vl:
1941 movzb (%rsi,%r8),%eax
1942 movzb (%rsp,%r8),%ecx
1945 mov %al,-1(%rdi,%r8)
1947 jnz .Loop_tail_avx512vl
1949 vmovdqu32 %ymm16,0x00(%rsp)
1950 vmovdqu32 %ymm16,0x20(%rsp)
1955 .Lavx512vl_epilogue:
1957 SYM_FUNC_END(chacha20_avx512vl)
1958 .type chacha20_16x,@function
1962 lea 8(%rsp),%r10 # frame register
1967 lea .Lsigma(%rip),%r9
1968 vbroadcasti32x4 (%r9),%zmm3 # key[0]
1969 vbroadcasti32x4 (%rcx),%zmm7 # key[1]
1970 vbroadcasti32x4 16(%rcx),%zmm11 # key[2]
1971 vbroadcasti32x4 (%r8),%zmm15 # key[3]
1973 vpshufd $0x00,%zmm3,%zmm0 # smash key by lanes...
1974 vpshufd $0x55,%zmm3,%zmm1
1975 vpshufd $0xaa,%zmm3,%zmm2
1976 vpshufd $0xff,%zmm3,%zmm3
1977 vmovdqa64 %zmm0,%zmm16
1978 vmovdqa64 %zmm1,%zmm17
1979 vmovdqa64 %zmm2,%zmm18
1980 vmovdqa64 %zmm3,%zmm19
1982 vpshufd $0x00,%zmm7,%zmm4
1983 vpshufd $0x55,%zmm7,%zmm5
1984 vpshufd $0xaa,%zmm7,%zmm6
1985 vpshufd $0xff,%zmm7,%zmm7
1986 vmovdqa64 %zmm4,%zmm20
1987 vmovdqa64 %zmm5,%zmm21
1988 vmovdqa64 %zmm6,%zmm22
1989 vmovdqa64 %zmm7,%zmm23
1991 vpshufd $0x00,%zmm11,%zmm8
1992 vpshufd $0x55,%zmm11,%zmm9
1993 vpshufd $0xaa,%zmm11,%zmm10
1994 vpshufd $0xff,%zmm11,%zmm11
1995 vmovdqa64 %zmm8,%zmm24
1996 vmovdqa64 %zmm9,%zmm25
1997 vmovdqa64 %zmm10,%zmm26
1998 vmovdqa64 %zmm11,%zmm27
2000 vpshufd $0x00,%zmm15,%zmm12
2001 vpshufd $0x55,%zmm15,%zmm13
2002 vpshufd $0xaa,%zmm15,%zmm14
2003 vpshufd $0xff,%zmm15,%zmm15
2004 vpaddd .Lincz(%rip),%zmm12,%zmm12 # don't save counters yet
2005 vmovdqa64 %zmm12,%zmm28
2006 vmovdqa64 %zmm13,%zmm29
2007 vmovdqa64 %zmm14,%zmm30
2008 vmovdqa64 %zmm15,%zmm31
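# In the 16x path the smashed key never touches the stack:
# %zmm16-%zmm31 hold the sixteen state words, one word per register,
# replicated across all sixteen lanes, with .Lincz giving each lane a
# distinct block counter.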
2015 vpbroadcastd 0(%r9),%zmm0 # reload key
2016 vpbroadcastd 4(%r9),%zmm1
2017 vpbroadcastd 8(%r9),%zmm2
2018 vpbroadcastd 12(%r9),%zmm3
2019 vpaddd .Lsixteen(%rip),%zmm28,%zmm28 # next SIMD counters
2020 vmovdqa64 %zmm20,%zmm4
2021 vmovdqa64 %zmm21,%zmm5
2022 vmovdqa64 %zmm22,%zmm6
2023 vmovdqa64 %zmm23,%zmm7
2024 vmovdqa64 %zmm24,%zmm8
2025 vmovdqa64 %zmm25,%zmm9
2026 vmovdqa64 %zmm26,%zmm10
2027 vmovdqa64 %zmm27,%zmm11
2028 vmovdqa64 %zmm28,%zmm12
2029 vmovdqa64 %zmm29,%zmm13
2030 vmovdqa64 %zmm30,%zmm14
2031 vmovdqa64 %zmm31,%zmm15
2033 vmovdqa64 %zmm0,%zmm16
2034 vmovdqa64 %zmm1,%zmm17
2035 vmovdqa64 %zmm2,%zmm18
2036 vmovdqa64 %zmm3,%zmm19
2043 vpaddd %zmm4,%zmm0,%zmm0
2044 vpaddd %zmm5,%zmm1,%zmm1
2045 vpaddd %zmm6,%zmm2,%zmm2
2046 vpaddd %zmm7,%zmm3,%zmm3
2047 vpxord %zmm0,%zmm12,%zmm12
2048 vpxord %zmm1,%zmm13,%zmm13
2049 vpxord %zmm2,%zmm14,%zmm14
2050 vpxord %zmm3,%zmm15,%zmm15
2051 vprold $16,%zmm12,%zmm12
2052 vprold $16,%zmm13,%zmm13
2053 vprold $16,%zmm14,%zmm14
2054 vprold $16,%zmm15,%zmm15
2055 vpaddd %zmm12,%zmm8,%zmm8
2056 vpaddd %zmm13,%zmm9,%zmm9
2057 vpaddd %zmm14,%zmm10,%zmm10
2058 vpaddd %zmm15,%zmm11,%zmm11
2059 vpxord %zmm8,%zmm4,%zmm4
2060 vpxord %zmm9,%zmm5,%zmm5
2061 vpxord %zmm10,%zmm6,%zmm6
2062 vpxord %zmm11,%zmm7,%zmm7
2063 vprold $12,%zmm4,%zmm4
2064 vprold $12,%zmm5,%zmm5
2065 vprold $12,%zmm6,%zmm6
2066 vprold $12,%zmm7,%zmm7
2067 vpaddd %zmm4,%zmm0,%zmm0
2068 vpaddd %zmm5,%zmm1,%zmm1
2069 vpaddd %zmm6,%zmm2,%zmm2
2070 vpaddd %zmm7,%zmm3,%zmm3
2071 vpxord %zmm0,%zmm12,%zmm12
2072 vpxord %zmm1,%zmm13,%zmm13
2073 vpxord %zmm2,%zmm14,%zmm14
2074 vpxord %zmm3,%zmm15,%zmm15
2075 vprold $8,%zmm12,%zmm12
2076 vprold $8,%zmm13,%zmm13
2077 vprold $8,%zmm14,%zmm14
2078 vprold $8,%zmm15,%zmm15
2079 vpaddd %zmm12,%zmm8,%zmm8
2080 vpaddd %zmm13,%zmm9,%zmm9
2081 vpaddd %zmm14,%zmm10,%zmm10
2082 vpaddd %zmm15,%zmm11,%zmm11
2083 vpxord %zmm8,%zmm4,%zmm4
2084 vpxord %zmm9,%zmm5,%zmm5
2085 vpxord %zmm10,%zmm6,%zmm6
2086 vpxord %zmm11,%zmm7,%zmm7
2087 vprold $7,%zmm4,%zmm4
2088 vprold $7,%zmm5,%zmm5
2089 vprold $7,%zmm6,%zmm6
2090 vprold $7,%zmm7,%zmm7
2091 vpaddd %zmm5,%zmm0,%zmm0
2092 vpaddd %zmm6,%zmm1,%zmm1
2093 vpaddd %zmm7,%zmm2,%zmm2
2094 vpaddd %zmm4,%zmm3,%zmm3
2095 vpxord %zmm0,%zmm15,%zmm15
2096 vpxord %zmm1,%zmm12,%zmm12
2097 vpxord %zmm2,%zmm13,%zmm13
2098 vpxord %zmm3,%zmm14,%zmm14
2099 vprold $16,%zmm15,%zmm15
2100 vprold $16,%zmm12,%zmm12
2101 vprold $16,%zmm13,%zmm13
2102 vprold $16,%zmm14,%zmm14
2103 vpaddd %zmm15,%zmm10,%zmm10
2104 vpaddd %zmm12,%zmm11,%zmm11
2105 vpaddd %zmm13,%zmm8,%zmm8
2106 vpaddd %zmm14,%zmm9,%zmm9
2107 vpxord %zmm10,%zmm5,%zmm5
2108 vpxord %zmm11,%zmm6,%zmm6
2109 vpxord %zmm8,%zmm7,%zmm7
2110 vpxord %zmm9,%zmm4,%zmm4
2111 vprold $12,%zmm5,%zmm5
2112 vprold $12,%zmm6,%zmm6
2113 vprold $12,%zmm7,%zmm7
2114 vprold $12,%zmm4,%zmm4
2115 vpaddd %zmm5,%zmm0,%zmm0
2116 vpaddd %zmm6,%zmm1,%zmm1
2117 vpaddd %zmm7,%zmm2,%zmm2
2118 vpaddd %zmm4,%zmm3,%zmm3
2119 vpxord %zmm0,%zmm15,%zmm15
2120 vpxord %zmm1,%zmm12,%zmm12
2121 vpxord %zmm2,%zmm13,%zmm13
2122 vpxord %zmm3,%zmm14,%zmm14
2123 vprold $8,%zmm15,%zmm15
2124 vprold $8,%zmm12,%zmm12
2125 vprold $8,%zmm13,%zmm13
2126 vprold $8,%zmm14,%zmm14
2127 vpaddd %zmm15,%zmm10,%zmm10
2128 vpaddd %zmm12,%zmm11,%zmm11
2129 vpaddd %zmm13,%zmm8,%zmm8
2130 vpaddd %zmm14,%zmm9,%zmm9
2131 vpxord %zmm10,%zmm5,%zmm5
2132 vpxord %zmm11,%zmm6,%zmm6
2133 vpxord %zmm8,%zmm7,%zmm7
2134 vpxord %zmm9,%zmm4,%zmm4
2135 vprold $7,%zmm5,%zmm5
2136 vprold $7,%zmm6,%zmm6
2137 vprold $7,%zmm7,%zmm7
2138 vprold $7,%zmm4,%zmm4
2142 vpaddd %zmm16,%zmm0,%zmm0 # accumulate key
2143 vpaddd %zmm17,%zmm1,%zmm1
2144 vpaddd %zmm18,%zmm2,%zmm2
2145 vpaddd %zmm19,%zmm3,%zmm3
2147 vpunpckldq %zmm1,%zmm0,%zmm18 # "de-interlace" data
2148 vpunpckldq %zmm3,%zmm2,%zmm19
2149 vpunpckhdq %zmm1,%zmm0,%zmm0
2150 vpunpckhdq %zmm3,%zmm2,%zmm2
2151 vpunpcklqdq %zmm19,%zmm18,%zmm1 # "a0"
2152 vpunpckhqdq %zmm19,%zmm18,%zmm18 # "a1"
2153 vpunpcklqdq %zmm2,%zmm0,%zmm3 # "a2"
2154 vpunpckhqdq %zmm2,%zmm0,%zmm0 # "a3"
2155 vpaddd %zmm20,%zmm4,%zmm4
2156 vpaddd %zmm21,%zmm5,%zmm5
2157 vpaddd %zmm22,%zmm6,%zmm6
2158 vpaddd %zmm23,%zmm7,%zmm7
2160 vpunpckldq %zmm5,%zmm4,%zmm2
2161 vpunpckldq %zmm7,%zmm6,%zmm19
2162 vpunpckhdq %zmm5,%zmm4,%zmm4
2163 vpunpckhdq %zmm7,%zmm6,%zmm6
2164 vpunpcklqdq %zmm19,%zmm2,%zmm5 # "b0"
2165 vpunpckhqdq %zmm19,%zmm2,%zmm2 # "b1"
2166 vpunpcklqdq %zmm6,%zmm4,%zmm7 # "b2"
2167 vpunpckhqdq %zmm6,%zmm4,%zmm4 # "b3"
2168 vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 # "de-interlace" further
2169 vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
2170 vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
2171 vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
2172 vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
2173 vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
2174 vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
2175 vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
2176 vpaddd %zmm24,%zmm8,%zmm8
2177 vpaddd %zmm25,%zmm9,%zmm9
2178 vpaddd %zmm26,%zmm10,%zmm10
2179 vpaddd %zmm27,%zmm11,%zmm11
2181 vpunpckldq %zmm9,%zmm8,%zmm6
2182 vpunpckldq %zmm11,%zmm10,%zmm0
2183 vpunpckhdq %zmm9,%zmm8,%zmm8
2184 vpunpckhdq %zmm11,%zmm10,%zmm10
2185 vpunpcklqdq %zmm0,%zmm6,%zmm9 # "c0"
2186 vpunpckhqdq %zmm0,%zmm6,%zmm6 # "c1"
2187 vpunpcklqdq %zmm10,%zmm8,%zmm11 # "c2"
2188 vpunpckhqdq %zmm10,%zmm8,%zmm8 # "c3"
2189 vpaddd %zmm28,%zmm12,%zmm12
2190 vpaddd %zmm29,%zmm13,%zmm13
2191 vpaddd %zmm30,%zmm14,%zmm14
2192 vpaddd %zmm31,%zmm15,%zmm15
2194 vpunpckldq %zmm13,%zmm12,%zmm10
2195 vpunpckldq %zmm15,%zmm14,%zmm0
2196 vpunpckhdq %zmm13,%zmm12,%zmm12
2197 vpunpckhdq %zmm15,%zmm14,%zmm14
2198 vpunpcklqdq %zmm0,%zmm10,%zmm13 # "d0"
2199 vpunpckhqdq %zmm0,%zmm10,%zmm10 # "d1"
2200 vpunpcklqdq %zmm14,%zmm12,%zmm15 # "d2"
2201 vpunpckhqdq %zmm14,%zmm12,%zmm12 # "d3"
2202 vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 # "de-interlace" further
2203 vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
2204 vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
2205 vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
2206 vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
2207 vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
2208 vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
2209 vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
2210 vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 # "de-interlace" further
2211 vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
2212 vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
2213 vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
2214 vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
2215 vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
2216 vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
2217 vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
2218 vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
2219 vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
2220 vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
2221 vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
2222 vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
2223 vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
2224 vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
2225 vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
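# Restoring block order at sixteen lanes takes three levels of
# shuffles: the dword/qword unpacks, then vshufi32x4 with $0x44/$0xee
# to pair 128-bit units, then a second vshufi32x4 pass with $0x88/$0xdd
# to gather complete 64-byte blocks for the stores below.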
2229 vpxord 0x00(%rsi),%zmm16,%zmm16 # xor with input
2230 vpxord 0x40(%rsi),%zmm17,%zmm17
2231 vpxord 0x80(%rsi),%zmm14,%zmm14
2232 vpxord 0xc0(%rsi),%zmm8,%zmm8
2233 vmovdqu32 %zmm16,0x00(%rdi)
2234 vmovdqu32 %zmm17,0x40(%rdi)
2235 vmovdqu32 %zmm14,0x80(%rdi)
2236 vmovdqu32 %zmm8,0xc0(%rdi)
2238 vpxord 0x100(%rsi),%zmm19,%zmm19
2239 vpxord 0x140(%rsi),%zmm1,%zmm1
2240 vpxord 0x180(%rsi),%zmm18,%zmm18
2241 vpxord 0x1c0(%rsi),%zmm3,%zmm3
2242 vmovdqu32 %zmm19,0x100(%rdi)
2243 vmovdqu32 %zmm1,0x140(%rdi)
2244 vmovdqu32 %zmm18,0x180(%rdi)
2245 vmovdqu32 %zmm3,0x1c0(%rdi)
2247 vpxord 0x200(%rsi),%zmm0,%zmm0
2248 vpxord 0x240(%rsi),%zmm9,%zmm9
2249 vpxord 0x280(%rsi),%zmm6,%zmm6
2250 vpxord 0x2c0(%rsi),%zmm11,%zmm11
2251 vmovdqu32 %zmm0,0x200(%rdi)
2252 vmovdqu32 %zmm9,0x240(%rdi)
2253 vmovdqu32 %zmm6,0x280(%rdi)
2254 vmovdqu32 %zmm11,0x2c0(%rdi)
2256 vpxord 0x300(%rsi),%zmm13,%zmm13
2257 vpxord 0x340(%rsi),%zmm10,%zmm10
2258 vpxord 0x380(%rsi),%zmm15,%zmm15
2259 vpxord 0x3c0(%rsi),%zmm12,%zmm12
2260 lea 0x400(%rsi),%rsi
2261 vmovdqu32 %zmm13,0x300(%rdi)
2262 vmovdqu32 %zmm10,0x340(%rdi)
2263 vmovdqu32 %zmm15,0x380(%rdi)
2264 vmovdqu32 %zmm12,0x3c0(%rdi)
2265 lea 0x400(%rdi),%rdi
2277 jb .Less_than_64_16x
2278 vpxord (%rsi),%zmm16,%zmm16 # xor with input
2279 vmovdqu32 %zmm16,(%rdi,%rsi)
2281 vmovdqa32 %zmm17,%zmm16
2285 jb .Less_than_64_16x
2286 vpxord (%rsi),%zmm17,%zmm17
2287 vmovdqu32 %zmm17,(%rdi,%rsi)
2289 vmovdqa32 %zmm14,%zmm16
2293 jb .Less_than_64_16x
2294 vpxord (%rsi),%zmm14,%zmm14
2295 vmovdqu32 %zmm14,(%rdi,%rsi)
2297 vmovdqa32 %zmm8,%zmm16
2301 jb .Less_than_64_16x
2302 vpxord (%rsi),%zmm8,%zmm8
2303 vmovdqu32 %zmm8,(%rdi,%rsi)
2305 vmovdqa32 %zmm19,%zmm16
2309 jb .Less_than_64_16x
2310 vpxord (%rsi),%zmm19,%zmm19
2311 vmovdqu32 %zmm19,(%rdi,%rsi)
2313 vmovdqa32 %zmm1,%zmm16
2317 jb .Less_than_64_16x
2318 vpxord (%rsi),%zmm1,%zmm1
2319 vmovdqu32 %zmm1,(%rdi,%rsi)
2321 vmovdqa32 %zmm18,%zmm16
2325 jb .Less_than_64_16x
2326 vpxord (%rsi),%zmm18,%zmm18
2327 vmovdqu32 %zmm18,(%rdi,%rsi)
2329 vmovdqa32 %zmm3,%zmm16
2333 jb .Less_than_64_16x
2334 vpxord (%rsi),%zmm3,%zmm3
2335 vmovdqu32 %zmm3,(%rdi,%rsi)
2337 vmovdqa32 %zmm0,%zmm16
2341 jb .Less_than_64_16x
2342 vpxord (%rsi),%zmm0,%zmm0
2343 vmovdqu32 %zmm0,(%rdi,%rsi)
2345 vmovdqa32 %zmm9,%zmm16
2349 jb .Less_than_64_16x
2350 vpxord (%rsi),%zmm9,%zmm9
2351 vmovdqu32 %zmm9,(%rdi,%rsi)
2353 vmovdqa32 %zmm6,%zmm16
2357 jb .Less_than_64_16x
2358 vpxord (%rsi),%zmm6,%zmm6
2359 vmovdqu32 %zmm6,(%rdi,%rsi)
2361 vmovdqa32 %zmm11,%zmm16
2365 jb .Less_than_64_16x
2366 vpxord (%rsi),%zmm11,%zmm11
2367 vmovdqu32 %zmm11,(%rdi,%rsi)
2369 vmovdqa32 %zmm13,%zmm16
2373 jb .Less_than_64_16x
2374 vpxord (%rsi),%zmm13,%zmm13
2375 vmovdqu32 %zmm13,(%rdi,%rsi)
2377 vmovdqa32 %zmm10,%zmm16
2381 jb .Less_than_64_16x
2382 vpxord (%rsi),%zmm10,%zmm10
2383 vmovdqu32 %zmm10,(%rdi,%rsi)
2385 vmovdqa32 %zmm15,%zmm16
2389 jb .Less_than_64_16x
2390 vpxord (%rsi),%zmm15,%zmm15
2391 vmovdqu32 %zmm15,(%rdi,%rsi)
2393 vmovdqa32 %zmm12,%zmm16
2397 vmovdqa32 %zmm16,0x00(%rsp)
2398 lea (%rdi,%rsi),%rdi
2402 movzb (%rsi,%r9),%eax
2403 movzb (%rsp,%r9),%ecx
2406 mov %al,-1(%rdi,%r9)
2410 vpxord %zmm16,%zmm16,%zmm16
2411 vmovdqa32 %zmm16,0(%rsp)
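# Scrub before returning: clear %zmm16 and overwrite the stack copy of
# the keystream so no key-derived material survives the call.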
2418 .size chacha20_16x,.-chacha20_16x
2419 .type chacha20_8xvl,@function
2423 lea 8(%rsp),%r10 # frame register
2428 lea .Lsigma(%rip),%r9
2429 vbroadcasti128 (%r9),%ymm3 # key[0]
2430 vbroadcasti128 (%rcx),%ymm7 # key[1]
2431 vbroadcasti128 16(%rcx),%ymm11 # key[2]
2432 vbroadcasti128 (%r8),%ymm15 # key[3]
2434 vpshufd $0x00,%ymm3,%ymm0 # smash key by lanes...
2435 vpshufd $0x55,%ymm3,%ymm1
2436 vpshufd $0xaa,%ymm3,%ymm2
2437 vpshufd $0xff,%ymm3,%ymm3
2438 vmovdqa64 %ymm0,%ymm16
2439 vmovdqa64 %ymm1,%ymm17
2440 vmovdqa64 %ymm2,%ymm18
2441 vmovdqa64 %ymm3,%ymm19
2443 vpshufd $0x00,%ymm7,%ymm4
2444 vpshufd $0x55,%ymm7,%ymm5
2445 vpshufd $0xaa,%ymm7,%ymm6
2446 vpshufd $0xff,%ymm7,%ymm7
2447 vmovdqa64 %ymm4,%ymm20
2448 vmovdqa64 %ymm5,%ymm21
2449 vmovdqa64 %ymm6,%ymm22
2450 vmovdqa64 %ymm7,%ymm23
2452 vpshufd $0x00,%ymm11,%ymm8
2453 vpshufd $0x55,%ymm11,%ymm9
2454 vpshufd $0xaa,%ymm11,%ymm10
2455 vpshufd $0xff,%ymm11,%ymm11
2456 vmovdqa64 %ymm8,%ymm24
2457 vmovdqa64 %ymm9,%ymm25
2458 vmovdqa64 %ymm10,%ymm26
2459 vmovdqa64 %ymm11,%ymm27
2461 vpshufd $0x00,%ymm15,%ymm12
2462 vpshufd $0x55,%ymm15,%ymm13
2463 vpshufd $0xaa,%ymm15,%ymm14
2464 vpshufd $0xff,%ymm15,%ymm15
2465 vpaddd .Lincy(%rip),%ymm12,%ymm12 # don't save counters yet
2466 vmovdqa64 %ymm12,%ymm28
2467 vmovdqa64 %ymm13,%ymm29
2468 vmovdqa64 %ymm14,%ymm30
2469 vmovdqa64 %ymm15,%ymm31
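# The 8xvl path mirrors the 16x register-resident layout at eight
# lanes: %ymm16-%ymm31 carry the smashed state and .Lincy supplies the
# per-lane counters, while AVX512VL makes vprold available on 256-bit
# registers.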
2476 #vpbroadcastd 0(%r9),%ymm0 # reload key
2477 #vpbroadcastd 4(%r9),%ymm1
2478 vpbroadcastd 8(%r9),%ymm2
2479 vpbroadcastd 12(%r9),%ymm3
2480 vpaddd .Leight(%rip),%ymm28,%ymm28 # next SIMD counters
2481 vmovdqa64 %ymm20,%ymm4
2482 vmovdqa64 %ymm21,%ymm5
2483 vmovdqa64 %ymm22,%ymm6
2484 vmovdqa64 %ymm23,%ymm7
2485 vmovdqa64 %ymm24,%ymm8
2486 vmovdqa64 %ymm25,%ymm9
2487 vmovdqa64 %ymm26,%ymm10
2488 vmovdqa64 %ymm27,%ymm11
2489 vmovdqa64 %ymm28,%ymm12
2490 vmovdqa64 %ymm29,%ymm13
2491 vmovdqa64 %ymm30,%ymm14
2492 vmovdqa64 %ymm31,%ymm15
2494 vmovdqa64 %ymm0,%ymm16
2495 vmovdqa64 %ymm1,%ymm17
2496 vmovdqa64 %ymm2,%ymm18
2497 vmovdqa64 %ymm3,%ymm19
2504 vpaddd %ymm4,%ymm0,%ymm0
2505 vpaddd %ymm5,%ymm1,%ymm1
2506 vpaddd %ymm6,%ymm2,%ymm2
2507 vpaddd %ymm7,%ymm3,%ymm3
2508 vpxor %ymm0,%ymm12,%ymm12
2509 vpxor %ymm1,%ymm13,%ymm13
2510 vpxor %ymm2,%ymm14,%ymm14
2511 vpxor %ymm3,%ymm15,%ymm15
2512 vprold $16,%ymm12,%ymm12
2513 vprold $16,%ymm13,%ymm13
2514 vprold $16,%ymm14,%ymm14
2515 vprold $16,%ymm15,%ymm15
2516 vpaddd %ymm12,%ymm8,%ymm8
2517 vpaddd %ymm13,%ymm9,%ymm9
2518 vpaddd %ymm14,%ymm10,%ymm10
2519 vpaddd %ymm15,%ymm11,%ymm11
2520 vpxor %ymm8,%ymm4,%ymm4
2521 vpxor %ymm9,%ymm5,%ymm5
2522 vpxor %ymm10,%ymm6,%ymm6
2523 vpxor %ymm11,%ymm7,%ymm7
2524 vprold $12,%ymm4,%ymm4
2525 vprold $12,%ymm5,%ymm5
2526 vprold $12,%ymm6,%ymm6
2527 vprold $12,%ymm7,%ymm7
2528 vpaddd %ymm4,%ymm0,%ymm0
2529 vpaddd %ymm5,%ymm1,%ymm1
2530 vpaddd %ymm6,%ymm2,%ymm2
2531 vpaddd %ymm7,%ymm3,%ymm3
2532 vpxor %ymm0,%ymm12,%ymm12
2533 vpxor %ymm1,%ymm13,%ymm13
2534 vpxor %ymm2,%ymm14,%ymm14
2535 vpxor %ymm3,%ymm15,%ymm15
2536 vprold $8,%ymm12,%ymm12
2537 vprold $8,%ymm13,%ymm13
2538 vprold $8,%ymm14,%ymm14
2539 vprold $8,%ymm15,%ymm15
2540 vpaddd %ymm12,%ymm8,%ymm8
2541 vpaddd %ymm13,%ymm9,%ymm9
2542 vpaddd %ymm14,%ymm10,%ymm10
2543 vpaddd %ymm15,%ymm11,%ymm11
2544 vpxor %ymm8,%ymm4,%ymm4
2545 vpxor %ymm9,%ymm5,%ymm5
2546 vpxor %ymm10,%ymm6,%ymm6
2547 vpxor %ymm11,%ymm7,%ymm7
2548 vprold $7,%ymm4,%ymm4
2549 vprold $7,%ymm5,%ymm5
2550 vprold $7,%ymm6,%ymm6
2551 vprold $7,%ymm7,%ymm7
2552 vpaddd %ymm5,%ymm0,%ymm0
2553 vpaddd %ymm6,%ymm1,%ymm1
2554 vpaddd %ymm7,%ymm2,%ymm2
2555 vpaddd %ymm4,%ymm3,%ymm3
2556 vpxor %ymm0,%ymm15,%ymm15
2557 vpxor %ymm1,%ymm12,%ymm12
2558 vpxor %ymm2,%ymm13,%ymm13
2559 vpxor %ymm3,%ymm14,%ymm14
2560 vprold $16,%ymm15,%ymm15
2561 vprold $16,%ymm12,%ymm12
2562 vprold $16,%ymm13,%ymm13
2563 vprold $16,%ymm14,%ymm14
2564 vpaddd %ymm15,%ymm10,%ymm10
2565 vpaddd %ymm12,%ymm11,%ymm11
2566 vpaddd %ymm13,%ymm8,%ymm8
2567 vpaddd %ymm14,%ymm9,%ymm9
2568 vpxor %ymm10,%ymm5,%ymm5
2569 vpxor %ymm11,%ymm6,%ymm6
2570 vpxor %ymm8,%ymm7,%ymm7
2571 vpxor %ymm9,%ymm4,%ymm4
2572 vprold $12,%ymm5,%ymm5
2573 vprold $12,%ymm6,%ymm6
2574 vprold $12,%ymm7,%ymm7
2575 vprold $12,%ymm4,%ymm4
2576 vpaddd %ymm5,%ymm0,%ymm0
2577 vpaddd %ymm6,%ymm1,%ymm1
2578 vpaddd %ymm7,%ymm2,%ymm2
2579 vpaddd %ymm4,%ymm3,%ymm3
2580 vpxor %ymm0,%ymm15,%ymm15
2581 vpxor %ymm1,%ymm12,%ymm12
2582 vpxor %ymm2,%ymm13,%ymm13
2583 vpxor %ymm3,%ymm14,%ymm14
2584 vprold $8,%ymm15,%ymm15
2585 vprold $8,%ymm12,%ymm12
2586 vprold $8,%ymm13,%ymm13
2587 vprold $8,%ymm14,%ymm14
2588 vpaddd %ymm15,%ymm10,%ymm10
2589 vpaddd %ymm12,%ymm11,%ymm11
2590 vpaddd %ymm13,%ymm8,%ymm8
2591 vpaddd %ymm14,%ymm9,%ymm9
2592 vpxor %ymm10,%ymm5,%ymm5
2593 vpxor %ymm11,%ymm6,%ymm6
2594 vpxor %ymm8,%ymm7,%ymm7
2595 vpxor %ymm9,%ymm4,%ymm4
2596 vprold $7,%ymm5,%ymm5
2597 vprold $7,%ymm6,%ymm6
2598 vprold $7,%ymm7,%ymm7
2599 vprold $7,%ymm4,%ymm4
2603 vpaddd %ymm16,%ymm0,%ymm0 # accumulate key
2604 vpaddd %ymm17,%ymm1,%ymm1
2605 vpaddd %ymm18,%ymm2,%ymm2
2606 vpaddd %ymm19,%ymm3,%ymm3
2608 vpunpckldq %ymm1,%ymm0,%ymm18 # "de-interlace" data
2609 vpunpckldq %ymm3,%ymm2,%ymm19
2610 vpunpckhdq %ymm1,%ymm0,%ymm0
2611 vpunpckhdq %ymm3,%ymm2,%ymm2
2612 vpunpcklqdq %ymm19,%ymm18,%ymm1 # "a0"
2613 vpunpckhqdq %ymm19,%ymm18,%ymm18 # "a1"
2614 vpunpcklqdq %ymm2,%ymm0,%ymm3 # "a2"
2615 vpunpckhqdq %ymm2,%ymm0,%ymm0 # "a3"
2616 vpaddd %ymm20,%ymm4,%ymm4
2617 vpaddd %ymm21,%ymm5,%ymm5
2618 vpaddd %ymm22,%ymm6,%ymm6
2619 vpaddd %ymm23,%ymm7,%ymm7
2621 vpunpckldq %ymm5,%ymm4,%ymm2
2622 vpunpckldq %ymm7,%ymm6,%ymm19
2623 vpunpckhdq %ymm5,%ymm4,%ymm4
2624 vpunpckhdq %ymm7,%ymm6,%ymm6
2625 vpunpcklqdq %ymm19,%ymm2,%ymm5 # "b0"
2626 vpunpckhqdq %ymm19,%ymm2,%ymm2 # "b1"
2627 vpunpcklqdq %ymm6,%ymm4,%ymm7 # "b2"
2628 vpunpckhqdq %ymm6,%ymm4,%ymm4 # "b3"
2629 vshufi32x4 $0,%ymm5,%ymm1,%ymm19 # "de-interlace" further
2630 vshufi32x4 $3,%ymm5,%ymm1,%ymm5
2631 vshufi32x4 $0,%ymm2,%ymm18,%ymm1
2632 vshufi32x4 $3,%ymm2,%ymm18,%ymm2
2633 vshufi32x4 $0,%ymm7,%ymm3,%ymm18
2634 vshufi32x4 $3,%ymm7,%ymm3,%ymm7
2635 vshufi32x4 $0,%ymm4,%ymm0,%ymm3
2636 vshufi32x4 $3,%ymm4,%ymm0,%ymm4
2637 vpaddd %ymm24,%ymm8,%ymm8
2638 vpaddd %ymm25,%ymm9,%ymm9
2639 vpaddd %ymm26,%ymm10,%ymm10
2640 vpaddd %ymm27,%ymm11,%ymm11
2642 vpunpckldq %ymm9,%ymm8,%ymm6
2643 vpunpckldq %ymm11,%ymm10,%ymm0
2644 vpunpckhdq %ymm9,%ymm8,%ymm8
2645 vpunpckhdq %ymm11,%ymm10,%ymm10
2646 vpunpcklqdq %ymm0,%ymm6,%ymm9 # "c0"
2647 vpunpckhqdq %ymm0,%ymm6,%ymm6 # "c1"
2648 vpunpcklqdq %ymm10,%ymm8,%ymm11 # "c2"
2649 vpunpckhqdq %ymm10,%ymm8,%ymm8 # "c3"
2650 vpaddd %ymm28,%ymm12,%ymm12
2651 vpaddd %ymm29,%ymm13,%ymm13
2652 vpaddd %ymm30,%ymm14,%ymm14
2653 vpaddd %ymm31,%ymm15,%ymm15
2655 vpunpckldq %ymm13,%ymm12,%ymm10
2656 vpunpckldq %ymm15,%ymm14,%ymm0
2657 vpunpckhdq %ymm13,%ymm12,%ymm12
2658 vpunpckhdq %ymm15,%ymm14,%ymm14
2659 vpunpcklqdq %ymm0,%ymm10,%ymm13 # "d0"
2660 vpunpckhqdq %ymm0,%ymm10,%ymm10 # "d1"
2661 vpunpcklqdq %ymm14,%ymm12,%ymm15 # "d2"
2662 vpunpckhqdq %ymm14,%ymm12,%ymm12 # "d3"
2663 vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 # "de-interlace" further
2664 vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
2665 vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
2666 vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
2667 vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
2668 vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
2669 vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
2670 vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
2674 mov $0x80,%eax # size optimization
2675 vpxord 0x00(%rsi),%ymm19,%ymm19 # xor with input
2676 vpxor 0x20(%rsi),%ymm0,%ymm0
2677 vpxor 0x40(%rsi),%ymm5,%ymm5
2678 vpxor 0x60(%rsi),%ymm13,%ymm13
2679 lea (%rsi,%rax),%rsi # size optimization
2680 vmovdqu32 %ymm19,0x00(%rdi)
2681 vmovdqu %ymm0,0x20(%rdi)
2682 vmovdqu %ymm5,0x40(%rdi)
2683 vmovdqu %ymm13,0x60(%rdi)
2684 lea (%rdi,%rax),%rdi # size optimization
2686 vpxor 0x00(%rsi),%ymm1,%ymm1
2687 vpxor 0x20(%rsi),%ymm9,%ymm9
2688 vpxor 0x40(%rsi),%ymm2,%ymm2
2689 vpxor 0x60(%rsi),%ymm10,%ymm10
2690 lea (%rsi,%rax),%rsi # size optimization
2691 vmovdqu %ymm1,0x00(%rdi)
2692 vmovdqu %ymm9,0x20(%rdi)
2693 vmovdqu %ymm2,0x40(%rdi)
2694 vmovdqu %ymm10,0x60(%rdi)
2695 lea (%rdi,%rax),%rdi # size optimization
2697 vpxord 0x00(%rsi),%ymm18,%ymm18
2698 vpxor 0x20(%rsi),%ymm6,%ymm6
2699 vpxor 0x40(%rsi),%ymm7,%ymm7
2700 vpxor 0x60(%rsi),%ymm15,%ymm15
2701 lea (%rsi,%rax),%rsi # size optimization
2702 vmovdqu32 %ymm18,0x00(%rdi)
2703 vmovdqu %ymm6,0x20(%rdi)
2704 vmovdqu %ymm7,0x40(%rdi)
2705 vmovdqu %ymm15,0x60(%rdi)
2706 lea (%rdi,%rax),%rdi # size optimization
2708 vpxor 0x00(%rsi),%ymm3,%ymm3
2709 vpxor 0x20(%rsi),%ymm11,%ymm11
2710 vpxor 0x40(%rsi),%ymm4,%ymm4
2711 vpxor 0x60(%rsi),%ymm12,%ymm12
2712 lea (%rsi,%rax),%rsi # size optimization
2713 vmovdqu %ymm3,0x00(%rdi)
2714 vmovdqu %ymm11,0x20(%rdi)
2715 vmovdqu %ymm4,0x40(%rdi)
2716 vmovdqu %ymm12,0x60(%rdi)
2717 lea (%rdi,%rax),%rdi # size optimization
2719 vpbroadcastd 0(%r9),%ymm0 # reload key
2720 vpbroadcastd 4(%r9),%ymm1
2729 vmovdqa64 %ymm19,%ymm8 # size optimization
2733 jb .Less_than_64_8xvl
2734 vpxor 0x00(%rsi),%ymm8,%ymm8 # xor with input
2735 vpxor 0x20(%rsi),%ymm0,%ymm0
2736 vmovdqu %ymm8,0x00(%rdi,%rsi)
2737 vmovdqu %ymm0,0x20(%rdi,%rsi)
2740 vmovdqa %ymm13,%ymm0
2744 jb .Less_than_64_8xvl
2745 vpxor 0x00(%rsi),%ymm5,%ymm5
2746 vpxor 0x20(%rsi),%ymm13,%ymm13
2747 vmovdqu %ymm5,0x00(%rdi,%rsi)
2748 vmovdqu %ymm13,0x20(%rdi,%rsi)
2755 jb .Less_than_64_8xvl
2756 vpxor 0x00(%rsi),%ymm1,%ymm1
2757 vpxor 0x20(%rsi),%ymm9,%ymm9
2758 vmovdqu %ymm1,0x00(%rdi,%rsi)
2759 vmovdqu %ymm9,0x20(%rdi,%rsi)
2762 vmovdqa %ymm10,%ymm0
2766 jb .Less_than_64_8xvl
2767 vpxor 0x00(%rsi),%ymm2,%ymm2
2768 vpxor 0x20(%rsi),%ymm10,%ymm10
2769 vmovdqu %ymm2,0x00(%rdi,%rsi)
2770 vmovdqu %ymm10,0x20(%rdi,%rsi)
2772 vmovdqa32 %ymm18,%ymm8
2777 jb .Less_than_64_8xvl
2778 vpxord 0x00(%rsi),%ymm18,%ymm18
2779 vpxor 0x20(%rsi),%ymm6,%ymm6
2780 vmovdqu32 %ymm18,0x00(%rdi,%rsi)
2781 vmovdqu %ymm6,0x20(%rdi,%rsi)
2784 vmovdqa %ymm15,%ymm0
2788 jb .Less_than_64_8xvl
2789 vpxor 0x00(%rsi),%ymm7,%ymm7
2790 vpxor 0x20(%rsi),%ymm15,%ymm15
2791 vmovdqu %ymm7,0x00(%rdi,%rsi)
2792 vmovdqu %ymm15,0x20(%rdi,%rsi)
2795 vmovdqa %ymm11,%ymm0
2799 jb .Less_than_64_8xvl
2800 vpxor 0x00(%rsi),%ymm3,%ymm3
2801 vpxor 0x20(%rsi),%ymm11,%ymm11
2802 vmovdqu %ymm3,0x00(%rdi,%rsi)
2803 vmovdqu %ymm11,0x20(%rdi,%rsi)
2806 vmovdqa %ymm12,%ymm0
2810 vmovdqa %ymm8,0x00(%rsp)
2811 vmovdqa %ymm0,0x20(%rsp)
2812 lea (%rdi,%rsi),%rdi
2816 movzb (%rsi,%r9),%eax
2817 movzb (%rsp,%r9),%ecx
2820 mov %al,-1(%rdi,%r9)
2824 vpxor %ymm8,%ymm8,%ymm8
2825 vmovdqa %ymm8,0x00(%rsp)
2826 vmovdqa %ymm8,0x20(%rsp)
2833 .size chacha20_8xvl,.-chacha20_8xvl