2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # ChaCha20 for x86_64.
23 # Add AVX512F code path.
27 # Add AVX512VL code path.
# Performance in cycles per byte out of a large buffer.
31 # IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
34 # Core2 7.83/+55% 7.90/5.76 4.35
35 # Westmere 7.19/+50% 5.60/4.50 3.00
36 # Sandy Bridge 8.31/+42% 5.45/4.00 2.72
37 # Ivy Bridge 6.71/+46% 5.40/? 2.41
38 # Haswell 5.92/+43% 5.20/3.45 2.42 1.23
39 # Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
40 # Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
41 # Knights L 11.7/- ? 9.60(iii) 0.80
42 # Goldmont 10.6/+17% 5.10/3.52 3.28
43 # Sledgehammer 7.28/+52% - -
44 # Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
45 # Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
46 # VIA Nano 10.5/+46% 6.72/6.88 6.05
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is a code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
55 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
56 # and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applies;
58 # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
59 # cpb in single thread, the corresponding capability is suppressed;
63 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
65 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
69 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
70 die "can't locate x86_64-xlate.pl";
72 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
73 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
74 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
77 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
78 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
79 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
80 $avx += 1 if ($1==2.11 && $2>=8);
83 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
84 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
85 $avx = ($1>=10) + ($1>=11);
88 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
89 $avx = ($2>=3.0) + ($2>3.0);
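
# $avx ends up as 0..3 and gates which code paths are emitted below:
# 1 enables the AVX/XOP paths, 2 adds AVX2, 3 adds the AVX512F/VL ones.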
92 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
95 # input parameter block
96 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
101 .extern OPENSSL_ia32cap_P
113 .long 0,2,4,6,1,3,5,7
115 .long 8,8,8,8,8,8,8,8
117 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
119 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
121 .long 2,0,0,0, 2,0,0,0
124 .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
126 .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
128 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
130 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
132 .asciz "expand 32-byte k"
133 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
136 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;			# last Perl argument comes first in AT&T syntax
    $arg = "\$$arg" if ($arg*1 eq $arg);
140 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
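
# For illustration, a few thunked calls and the lines they append to
# $code (destination operand written first, as in 32-bit perlasm):
#
#	&add ("%eax","%ebx");		# add	%ebx,%eax	(%eax += %ebx)
#	&pslld ("%xmm1",12);		# pslld	$12,%xmm1
#	&mov ("4*8(%rsp)","%ebp");	# mov	%ebp,4*8(%rsp)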
143 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
144 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
147 sub ROUND { # critical path is 24 cycles per round
148 my ($a0,$b0,$c0,$d0)=@_;
149 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
150 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
151 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
152 my ($xc,$xc_)=map("\"$_\"",@t);
153 my @x=map("\"$_\"",@x);
# Consider the order in which variables are addressed by their
# index:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
# 'a', 'b' and 'd's are permanently allocated in registers,
# @x[0..7,12..15], while 'c's are maintained in memory. If
# you observe the 'c' column, you'll notice that a pair of 'c's
# is invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see
# a bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
# Normally instructions would be interleaved to favour in-order
# execution. Out-of-order cores generally manage it gracefully,
# but not this time, for some reason. As in-order execution
# cores are a dying breed, old Atom being the only one still
# around, instructions are left uninterleaved. Besides, Atom is
# better off executing 1xSSSE3 code anyway...
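#
# For reference, each quarter-round Qi below computes, on 32-bit words
# (a scalar sketch, not something this generator emits verbatim):
#
#	$a += $b; $d ^= $a; $d = ($d << 16) | ($d >> 16);
#	$c += $d; $b ^= $c; $b = ($b << 12) | ($b >> 20);
#	$a += $b; $d ^= $a; $d = ($d <<  8) | ($d >> 24);
#	$c += $d; $b ^= $c; $b = ($b <<  7) | ($b >> 25);
#
# Q1..Q4 are interleaved across the four columns (even round) or
# diagonals (odd round).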
185 "&add (@x[$a0],@x[$b0])", # Q1
186 "&xor (@x[$d0],@x[$a0])",
188 "&add (@x[$a1],@x[$b1])", # Q2
189 "&xor (@x[$d1],@x[$a1])",
192 "&add ($xc,@x[$d0])",
193 "&xor (@x[$b0],$xc)",
195 "&add ($xc_,@x[$d1])",
196 "&xor (@x[$b1],$xc_)",
199 "&add (@x[$a0],@x[$b0])",
200 "&xor (@x[$d0],@x[$a0])",
202 "&add (@x[$a1],@x[$b1])",
203 "&xor (@x[$d1],@x[$a1])",
206 "&add ($xc,@x[$d0])",
207 "&xor (@x[$b0],$xc)",
209 "&add ($xc_,@x[$d1])",
210 "&xor (@x[$b1],$xc_)",
213 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
214 "&mov (\"4*$c1(%rsp)\",$xc_)",
215 "&mov ($xc,\"4*$c2(%rsp)\")",
216 "&mov ($xc_,\"4*$c3(%rsp)\")",
218 "&add (@x[$a2],@x[$b2])", # Q3
219 "&xor (@x[$d2],@x[$a2])",
221 "&add (@x[$a3],@x[$b3])", # Q4
222 "&xor (@x[$d3],@x[$a3])",
225 "&add ($xc,@x[$d2])",
226 "&xor (@x[$b2],$xc)",
228 "&add ($xc_,@x[$d3])",
229 "&xor (@x[$b3],$xc_)",
232 "&add (@x[$a2],@x[$b2])",
233 "&xor (@x[$d2],@x[$a2])",
235 "&add (@x[$a3],@x[$b3])",
236 "&xor (@x[$d3],@x[$a3])",
239 "&add ($xc,@x[$d2])",
240 "&xor (@x[$b2],$xc)",
242 "&add ($xc_,@x[$d3])",
243 "&xor (@x[$b3],$xc_)",
248 ########################################################################
249 # Generic code path that handles all lengths on pre-SSSE3 processors.
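#
# The entry point below corresponds to the C declaration used by
# OpenSSL's C glue (shown for reference):
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);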
251 .globl ChaCha20_ctr32
252 .type ChaCha20_ctr32,\@function,5
258 mov OPENSSL_ia32cap_P+4(%rip),%r10
260 $code.=<<___ if ($avx>2);
261 bt \$48,%r10 # check for AVX512F
263 test %r10,%r10 # check for AVX512VL
264 js .LChaCha20_avx512vl
267 test \$`1<<(41-32)`,%r10d
283 .cfi_adjust_cfa_offset 64+24
286 #movdqa .Lsigma(%rip),%xmm0
288 movdqu 16($key),%xmm2
289 movdqu ($counter),%xmm3
290 movdqa .Lone(%rip),%xmm4
292 #movdqa %xmm0,4*0(%rsp) # key[0]
293 movdqa %xmm1,4*4(%rsp) # key[1]
294 movdqa %xmm2,4*8(%rsp) # key[2]
295 movdqa %xmm3,4*12(%rsp) # key[3]
296 mov $len,%rbp # reassign $len
301 mov \$0x61707865,@x[0] # 'expa'
302 mov \$0x3320646e,@x[1] # 'nd 3'
303 mov \$0x79622d32,@x[2] # '2-by'
304 mov \$0x6b206574,@x[3] # 'te k'
310 mov 4*13(%rsp),@x[13]
311 mov 4*14(%rsp),@x[14]
312 mov 4*15(%rsp),@x[15]
314 mov %rbp,64+0(%rsp) # save len
316 mov $inp,64+8(%rsp) # save inp
317 movq %xmm2,%rsi # "@x[8]"
318 mov $out,64+16(%rsp) # save out
320 shr \$32,%rdi # "@x[9]"
326 foreach (&ROUND (0, 4, 8,12)) { eval; }
327 foreach (&ROUND (0, 5,10,15)) { eval; }
332 mov @t[1],4*9(%rsp) # modulo-scheduled
334 mov 64(%rsp),%rbp # load len
336 mov 64+8(%rsp),$inp # load inp
337 paddd %xmm4,%xmm3 # increment counter
338 mov 64+16(%rsp),$out # load out
340 add \$0x61707865,@x[0] # 'expa'
341 add \$0x3320646e,@x[1] # 'nd 3'
342 add \$0x79622d32,@x[2] # '2-by'
343 add \$0x6b206574,@x[3] # 'te k'
348 add 4*12(%rsp),@x[12]
349 add 4*13(%rsp),@x[13]
350 add 4*14(%rsp),@x[14]
351 add 4*15(%rsp),@x[15]
352 paddd 4*8(%rsp),%xmm1
357 xor 4*0($inp),@x[0] # xor with input
365 movdqu 4*8($inp),%xmm0
366 xor 4*12($inp),@x[12]
367 xor 4*13($inp),@x[13]
368 xor 4*14($inp),@x[14]
369 xor 4*15($inp),@x[15]
370 lea 4*16($inp),$inp # inp+=64
373 movdqa %xmm2,4*8(%rsp)
374 movd %xmm3,4*12(%rsp)
376 mov @x[0],4*0($out) # write output
384 movdqu %xmm0,4*8($out)
385 mov @x[12],4*12($out)
386 mov @x[13],4*13($out)
387 mov @x[14],4*14($out)
388 mov @x[15],4*15($out)
389 lea 4*16($out),$out # out+=64
407 movdqa %xmm1,4*8(%rsp)
408 mov @x[12],4*12(%rsp)
409 mov @x[13],4*13(%rsp)
410 mov @x[14],4*14(%rsp)
411 mov @x[15],4*15(%rsp)
414 movzb ($inp,%rbx),%eax
415 movzb (%rsp,%rbx),%edx
418 mov %al,-1($out,%rbx)
423 lea 64+24+48(%rsp),%rsi
438 .cfi_def_cfa_register %rsp
442 .size ChaCha20_ctr32,.-ChaCha20_ctr32
445 ########################################################################
446 # SSSE3 code path that handles shorter lengths
448 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
450 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
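# A sketch of one column step: 16-bit (and 8-bit) rotates are a single
# pshufb through $rot16 ($rot24), while 12- and 7-bit rotates take the
# classic copy/shift/shift/or sequence:
#
#	&paddd	($a,$b);
#	&pxor	($d,$a);
#	&pshufb	($d,$rot16);
#
#	&paddd	($c,$d);
#	&pxor	($b,$c);
#	&movdqa	($t,$b);
#	&psrld	($b,20);
#	&pslld	($t,12);
#	&por	($b,$t);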
474 my $xframe = $win64 ? 32+8 : 8;
477 .type ChaCha20_ssse3,\@function,5
482 mov %rsp,%r9 # frame pointer
483 .cfi_def_cfa_register %r9
485 $code.=<<___ if ($avx);
486 test \$`1<<(43-32)`,%r10d
487 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
490 cmp \$128,$len # we might throw away some data,
492 ja .LChaCha20_4x # but overall it won't be slower
495 sub \$64+$xframe,%rsp
497 $code.=<<___ if ($win64);
498 movaps %xmm6,-0x28(%r9)
499 movaps %xmm7,-0x18(%r9)
503 movdqa .Lsigma(%rip),$a
507 movdqa .Lrot16(%rip),$rot16
508 movdqa .Lrot24(%rip),$rot24
514 mov \$10,$counter # reuse $counter
519 movdqa .Lone(%rip),$d
532 &pshufd ($c,$c,0b01001110);
533 &pshufd ($b,$b,0b00111001);
534 &pshufd ($d,$d,0b10010011);
538 &pshufd ($c,$c,0b01001110);
539 &pshufd ($b,$b,0b10010011);
540 &pshufd ($d,$d,0b00111001);
543 &jnz (".Loop_ssse3");
555 movdqu 0x10($inp),$t1
556 pxor $t,$a # xor with input
559 movdqu 0x30($inp),$t1
560 lea 0x40($inp),$inp # inp+=64
564 movdqu $a,0x00($out) # write output
568 lea 0x40($out),$out # out+=64
571 jnz .Loop_outer_ssse3
581 xor $counter,$counter
584 movzb ($inp,$counter),%eax
585 movzb (%rsp,$counter),%ecx
586 lea 1($counter),$counter
588 mov %al,-1($out,$counter)
594 $code.=<<___ if ($win64);
595 movaps -0x28(%r9),%xmm6
596 movaps -0x18(%r9),%xmm7
600 .cfi_def_cfa_register %rsp
604 .size ChaCha20_ssse3,.-ChaCha20_ssse3
608 ########################################################################
609 # SSSE3 code path that handles 128-byte inputs
611 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
612 my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
656 my $xframe = $win64 ? 0x68 : 8;
659 .type ChaCha20_128,\@function,5
664 mov %rsp,%r9 # frame pointer
665 .cfi_def_cfa_register %r9
666 sub \$64+$xframe,%rsp
668 $code.=<<___ if ($win64);
669 movaps %xmm6,-0x68(%r9)
670 movaps %xmm7,-0x58(%r9)
671 movaps %xmm8,-0x48(%r9)
672 movaps %xmm9,-0x38(%r9)
673 movaps %xmm10,-0x28(%r9)
674 movaps %xmm11,-0x18(%r9)
678 movdqa .Lsigma(%rip),$a
682 movdqa .Lone(%rip),$d1
683 movdqa .Lrot16(%rip),$rot16
684 movdqa .Lrot24(%rip),$rot24
694 mov \$10,$counter # reuse $counter
701 &pshufd ($c,$c,0b01001110);
702 &pshufd ($b,$b,0b00111001);
703 &pshufd ($d,$d,0b10010011);
704 &pshufd ($c1,$c1,0b01001110);
705 &pshufd ($b1,$b1,0b00111001);
706 &pshufd ($d1,$d1,0b10010011);
709 &pshufd ($c,$c,0b01001110);
710 &pshufd ($b,$b,0b10010011);
711 &pshufd ($d,$d,0b00111001);
712 &pshufd ($c1,$c1,0b01001110);
713 &pshufd ($b1,$b1,0b10010011);
714 &pshufd ($d1,$d1,0b00111001);
724 paddd .Lone(%rip),$d1
731 movdqu 0x10($inp),$t1
732 pxor $t,$a # xor with input
735 movdqu 0x30($inp),$t1
739 movdqu 0x50($inp),$t1
743 movdqu 0x70($inp),$t1
747 movdqu $a,0x00($out) # write output
751 movdqu $a1,0x40($out)
752 movdqu $b1,0x50($out)
753 movdqu $c1,0x60($out)
754 movdqu $d1,0x70($out)
756 $code.=<<___ if ($win64);
757 movaps -0x68(%r9),%xmm6
758 movaps -0x58(%r9),%xmm7
759 movaps -0x48(%r9),%xmm8
760 movaps -0x38(%r9),%xmm9
761 movaps -0x28(%r9),%xmm10
762 movaps -0x18(%r9),%xmm11
766 .cfi_def_cfa_register %rsp
770 .size ChaCha20_128,.-ChaCha20_128
774 ########################################################################
775 # SSSE3 code path that handles longer messages.
777 # assign variables to favor Atom front-end
778 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
779 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
780 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
781 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
783 sub SSSE3_lane_ROUND {
784 my ($a0,$b0,$c0,$d0)=@_;
785 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
786 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
787 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
788 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
789 my @x=map("\"$_\"",@xx);
# Consider the order in which variables are addressed by their
# index:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
# 'a', 'b' and 'd's are permanently allocated in registers,
# @x[0..7,12..15], while 'c's are maintained in memory. If
# you observe the 'c' column, you'll notice that a pair of 'c's
# is invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see
# a bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
814 "&paddd (@x[$a0],@x[$b0])", # Q1
815 "&paddd (@x[$a1],@x[$b1])", # Q2
816 "&pxor (@x[$d0],@x[$a0])",
817 "&pxor (@x[$d1],@x[$a1])",
818 "&pshufb (@x[$d0],$t1)",
819 "&pshufb (@x[$d1],$t1)",
821 "&paddd ($xc,@x[$d0])",
822 "&paddd ($xc_,@x[$d1])",
823 "&pxor (@x[$b0],$xc)",
824 "&pxor (@x[$b1],$xc_)",
825 "&movdqa ($t0,@x[$b0])",
826 "&pslld (@x[$b0],12)",
828 "&movdqa ($t1,@x[$b1])",
829 "&pslld (@x[$b1],12)",
830 "&por (@x[$b0],$t0)",
832 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
833 "&por (@x[$b1],$t1)",
835 "&paddd (@x[$a0],@x[$b0])",
836 "&paddd (@x[$a1],@x[$b1])",
837 "&pxor (@x[$d0],@x[$a0])",
838 "&pxor (@x[$d1],@x[$a1])",
839 "&pshufb (@x[$d0],$t0)",
840 "&pshufb (@x[$d1],$t0)",
842 "&paddd ($xc,@x[$d0])",
843 "&paddd ($xc_,@x[$d1])",
844 "&pxor (@x[$b0],$xc)",
845 "&pxor (@x[$b1],$xc_)",
846 "&movdqa ($t1,@x[$b0])",
847 "&pslld (@x[$b0],7)",
849 "&movdqa ($t0,@x[$b1])",
850 "&pslld (@x[$b1],7)",
851 "&por (@x[$b0],$t1)",
853 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
854 "&por (@x[$b1],$t0)",
856 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
857 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
858 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
859 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
861 "&paddd (@x[$a2],@x[$b2])", # Q3
862 "&paddd (@x[$a3],@x[$b3])", # Q4
863 "&pxor (@x[$d2],@x[$a2])",
864 "&pxor (@x[$d3],@x[$a3])",
865 "&pshufb (@x[$d2],$t1)",
866 "&pshufb (@x[$d3],$t1)",
868 "&paddd ($xc,@x[$d2])",
869 "&paddd ($xc_,@x[$d3])",
870 "&pxor (@x[$b2],$xc)",
871 "&pxor (@x[$b3],$xc_)",
872 "&movdqa ($t0,@x[$b2])",
873 "&pslld (@x[$b2],12)",
875 "&movdqa ($t1,@x[$b3])",
876 "&pslld (@x[$b3],12)",
877 "&por (@x[$b2],$t0)",
879 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
880 "&por (@x[$b3],$t1)",
882 "&paddd (@x[$a2],@x[$b2])",
883 "&paddd (@x[$a3],@x[$b3])",
884 "&pxor (@x[$d2],@x[$a2])",
885 "&pxor (@x[$d3],@x[$a3])",
886 "&pshufb (@x[$d2],$t0)",
887 "&pshufb (@x[$d3],$t0)",
889 "&paddd ($xc,@x[$d2])",
890 "&paddd ($xc_,@x[$d3])",
891 "&pxor (@x[$b2],$xc)",
892 "&pxor (@x[$b3],$xc_)",
893 "&movdqa ($t1,@x[$b2])",
894 "&pslld (@x[$b2],7)",
896 "&movdqa ($t0,@x[$b3])",
897 "&pslld (@x[$b3],7)",
898 "&por (@x[$b2],$t1)",
900 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
905 my $xframe = $win64 ? 0xa8 : 8;
908 .type ChaCha20_4x,\@function,5
913 mov %rsp,%r9 # frame pointer
914 .cfi_def_cfa_register %r9
917 $code.=<<___ if ($avx>1);
918 shr \$32,%r10 # OPENSSL_ia32cap_P+8
919 test \$`1<<5`,%r10 # test AVX2
926 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
927 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
928 je .Ldo_sse3_after_all # to detect Atom
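# MOVBE without XSAVE identifies Silvermont-class Atom, which is
# better served by the 1x/2xSSSE3 code than by this 4x path (cf.
# footnote (iii) at the top).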
931 sub \$0x140+$xframe,%rsp
933 ################ stack layout
934 # +0x00 SIMD equivalent of @x[8-12]
936 # +0x40 constant copy of key[0-2] smashed by lanes
938 # +0x100 SIMD counters (with nonce smashed by lanes)
941 $code.=<<___ if ($win64);
942 movaps %xmm6,-0xa8(%r9)
943 movaps %xmm7,-0x98(%r9)
944 movaps %xmm8,-0x88(%r9)
945 movaps %xmm9,-0x78(%r9)
946 movaps %xmm10,-0x68(%r9)
947 movaps %xmm11,-0x58(%r9)
948 movaps %xmm12,-0x48(%r9)
949 movaps %xmm13,-0x38(%r9)
950 movaps %xmm14,-0x28(%r9)
951 movaps %xmm15,-0x18(%r9)
955 movdqa .Lsigma(%rip),$xa3 # key[0]
956 movdqu ($key),$xb3 # key[1]
957 movdqu 16($key),$xt3 # key[2]
958 movdqu ($counter),$xd3 # key[3]
959 lea 0x100(%rsp),%rcx # size optimization
960 lea .Lrot16(%rip),%r10
961 lea .Lrot24(%rip),%r11
963 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
964 pshufd \$0x55,$xa3,$xa1
965 movdqa $xa0,0x40(%rsp) # ... and offload
966 pshufd \$0xaa,$xa3,$xa2
967 movdqa $xa1,0x50(%rsp)
968 pshufd \$0xff,$xa3,$xa3
969 movdqa $xa2,0x60(%rsp)
970 movdqa $xa3,0x70(%rsp)
972 pshufd \$0x00,$xb3,$xb0
973 pshufd \$0x55,$xb3,$xb1
974 movdqa $xb0,0x80-0x100(%rcx)
975 pshufd \$0xaa,$xb3,$xb2
976 movdqa $xb1,0x90-0x100(%rcx)
977 pshufd \$0xff,$xb3,$xb3
978 movdqa $xb2,0xa0-0x100(%rcx)
979 movdqa $xb3,0xb0-0x100(%rcx)
981 pshufd \$0x00,$xt3,$xt0 # "$xc0"
982 pshufd \$0x55,$xt3,$xt1 # "$xc1"
983 movdqa $xt0,0xc0-0x100(%rcx)
984 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
985 movdqa $xt1,0xd0-0x100(%rcx)
986 pshufd \$0xff,$xt3,$xt3 # "$xc3"
987 movdqa $xt2,0xe0-0x100(%rcx)
988 movdqa $xt3,0xf0-0x100(%rcx)
990 pshufd \$0x00,$xd3,$xd0
991 pshufd \$0x55,$xd3,$xd1
992 paddd .Linc(%rip),$xd0 # don't save counters yet
993 pshufd \$0xaa,$xd3,$xd2
994 movdqa $xd1,0x110-0x100(%rcx)
995 pshufd \$0xff,$xd3,$xd3
996 movdqa $xd2,0x120-0x100(%rcx)
997 movdqa $xd3,0x130-0x100(%rcx)
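
# At this point each of the 16 state words occupies a full XMM register
# (or stack slot), one 32-bit lane per block, so four blocks are
# produced per pass; adding .Linc above turns the broadcast counter
# into {n,n+1,n+2,n+3}.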
1003 movdqa 0x40(%rsp),$xa0 # re-load smashed key
1004 movdqa 0x50(%rsp),$xa1
1005 movdqa 0x60(%rsp),$xa2
1006 movdqa 0x70(%rsp),$xa3
1007 movdqa 0x80-0x100(%rcx),$xb0
1008 movdqa 0x90-0x100(%rcx),$xb1
1009 movdqa 0xa0-0x100(%rcx),$xb2
1010 movdqa 0xb0-0x100(%rcx),$xb3
1011 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1012 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1013 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1014 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1015 movdqa 0x100-0x100(%rcx),$xd0
1016 movdqa 0x110-0x100(%rcx),$xd1
1017 movdqa 0x120-0x100(%rcx),$xd2
1018 movdqa 0x130-0x100(%rcx),$xd3
1019 paddd .Lfour(%rip),$xd0 # next SIMD counters
1022 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
1023 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
1024 movdqa (%r10),$xt3 # .Lrot16(%rip)
1026 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1032 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1033 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
1038 paddd 0x40(%rsp),$xa0 # accumulate key material
1039 paddd 0x50(%rsp),$xa1
1040 paddd 0x60(%rsp),$xa2
1041 paddd 0x70(%rsp),$xa3
1043 movdqa $xa0,$xt2 # "de-interlace" data
1050 punpcklqdq $xa2,$xa0 # "a0"
1052 punpcklqdq $xt3,$xt2 # "a2"
1053 punpckhqdq $xa2,$xa1 # "a1"
1054 punpckhqdq $xt3,$xa3 # "a3"
1056 ($xa2,$xt2)=($xt2,$xa2);
1058 paddd 0x80-0x100(%rcx),$xb0
1059 paddd 0x90-0x100(%rcx),$xb1
1060 paddd 0xa0-0x100(%rcx),$xb2
1061 paddd 0xb0-0x100(%rcx),$xb3
1063 movdqa $xa0,0x00(%rsp) # offload $xaN
1064 movdqa $xa1,0x10(%rsp)
1065 movdqa 0x20(%rsp),$xa0 # "xc2"
1066 movdqa 0x30(%rsp),$xa1 # "xc3"
1075 punpcklqdq $xb2,$xb0 # "b0"
1077 punpcklqdq $xt3,$xt2 # "b2"
1078 punpckhqdq $xb2,$xb1 # "b1"
1079 punpckhqdq $xt3,$xb3 # "b3"
1081 ($xb2,$xt2)=($xt2,$xb2);
1082 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1084 paddd 0xc0-0x100(%rcx),$xc0
1085 paddd 0xd0-0x100(%rcx),$xc1
1086 paddd 0xe0-0x100(%rcx),$xc2
1087 paddd 0xf0-0x100(%rcx),$xc3
1089 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
1090 movdqa $xa3,0x30(%rsp)
1099 punpcklqdq $xc2,$xc0 # "c0"
1101 punpcklqdq $xt3,$xt2 # "c2"
1102 punpckhqdq $xc2,$xc1 # "c1"
1103 punpckhqdq $xt3,$xc3 # "c3"
1105 ($xc2,$xt2)=($xt2,$xc2);
1106 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
1108 paddd 0x100-0x100(%rcx),$xd0
1109 paddd 0x110-0x100(%rcx),$xd1
1110 paddd 0x120-0x100(%rcx),$xd2
1111 paddd 0x130-0x100(%rcx),$xd3
1120 punpcklqdq $xd2,$xd0 # "d0"
1122 punpcklqdq $xt3,$xt2 # "d2"
1123 punpckhqdq $xd2,$xd1 # "d1"
1124 punpckhqdq $xt3,$xd3 # "d3"
1126 ($xd2,$xt2)=($xt2,$xd2);
1131 movdqu 0x00($inp),$xt0 # xor with input
1132 movdqu 0x10($inp),$xt1
1133 movdqu 0x20($inp),$xt2
1134 movdqu 0x30($inp),$xt3
1135 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1140 movdqu $xt0,0x00($out)
1141 movdqu 0x40($inp),$xt0
1142 movdqu $xt1,0x10($out)
1143 movdqu 0x50($inp),$xt1
1144 movdqu $xt2,0x20($out)
1145 movdqu 0x60($inp),$xt2
1146 movdqu $xt3,0x30($out)
1147 movdqu 0x70($inp),$xt3
1148 lea 0x80($inp),$inp # size optimization
1149 pxor 0x10(%rsp),$xt0
1154 movdqu $xt0,0x40($out)
1155 movdqu 0x00($inp),$xt0
1156 movdqu $xt1,0x50($out)
1157 movdqu 0x10($inp),$xt1
1158 movdqu $xt2,0x60($out)
1159 movdqu 0x20($inp),$xt2
1160 movdqu $xt3,0x70($out)
1161 lea 0x80($out),$out # size optimization
1162 movdqu 0x30($inp),$xt3
1163 pxor 0x20(%rsp),$xt0
1168 movdqu $xt0,0x00($out)
1169 movdqu 0x40($inp),$xt0
1170 movdqu $xt1,0x10($out)
1171 movdqu 0x50($inp),$xt1
1172 movdqu $xt2,0x20($out)
1173 movdqu 0x60($inp),$xt2
1174 movdqu $xt3,0x30($out)
1175 movdqu 0x70($inp),$xt3
1176 lea 0x80($inp),$inp # inp+=64*4
1177 pxor 0x30(%rsp),$xt0
1181 movdqu $xt0,0x40($out)
1182 movdqu $xt1,0x50($out)
1183 movdqu $xt2,0x60($out)
1184 movdqu $xt3,0x70($out)
1185 lea 0x80($out),$out # out+=64*4
1200 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1202 #movdqa $xt0,0x00(%rsp)
1203 movdqa $xb0,0x10(%rsp)
1204 movdqa $xc0,0x20(%rsp)
1205 movdqa $xd0,0x30(%rsp)
1210 movdqu 0x00($inp),$xt0 # xor with input
1211 movdqu 0x10($inp),$xt1
1212 movdqu 0x20($inp),$xt2
1213 movdqu 0x30($inp),$xt3
pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1218 movdqu $xt0,0x00($out)
1219 movdqu $xt1,0x10($out)
1220 movdqu $xt2,0x20($out)
1221 movdqu $xt3,0x30($out)
1224 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1225 lea 0x40($inp),$inp # inp+=64*1
1227 movdqa $xt0,0x00(%rsp)
1228 movdqa $xb1,0x10(%rsp)
1229 lea 0x40($out),$out # out+=64*1
1230 movdqa $xc1,0x20(%rsp)
1231 sub \$64,$len # len-=64*1
1232 movdqa $xd1,0x30(%rsp)
1237 movdqu 0x00($inp),$xt0 # xor with input
1238 movdqu 0x10($inp),$xt1
1239 movdqu 0x20($inp),$xt2
1240 movdqu 0x30($inp),$xt3
1241 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1246 movdqu $xt0,0x00($out)
1247 movdqu 0x40($inp),$xt0
1248 movdqu $xt1,0x10($out)
1249 movdqu 0x50($inp),$xt1
1250 movdqu $xt2,0x20($out)
1251 movdqu 0x60($inp),$xt2
1252 movdqu $xt3,0x30($out)
1253 movdqu 0x70($inp),$xt3
1254 pxor 0x10(%rsp),$xt0
1258 movdqu $xt0,0x40($out)
1259 movdqu $xt1,0x50($out)
1260 movdqu $xt2,0x60($out)
1261 movdqu $xt3,0x70($out)
1264 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1265 lea 0x80($inp),$inp # inp+=64*2
1267 movdqa $xt0,0x00(%rsp)
1268 movdqa $xb2,0x10(%rsp)
1269 lea 0x80($out),$out # out+=64*2
1270 movdqa $xc2,0x20(%rsp)
1271 sub \$128,$len # len-=64*2
1272 movdqa $xd2,0x30(%rsp)
1277 movdqu 0x00($inp),$xt0 # xor with input
1278 movdqu 0x10($inp),$xt1
1279 movdqu 0x20($inp),$xt2
1280 movdqu 0x30($inp),$xt3
1281 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1286 movdqu $xt0,0x00($out)
1287 movdqu 0x40($inp),$xt0
1288 movdqu $xt1,0x10($out)
1289 movdqu 0x50($inp),$xt1
1290 movdqu $xt2,0x20($out)
1291 movdqu 0x60($inp),$xt2
1292 movdqu $xt3,0x30($out)
1293 movdqu 0x70($inp),$xt3
1294 lea 0x80($inp),$inp # size optimization
1295 pxor 0x10(%rsp),$xt0
1300 movdqu $xt0,0x40($out)
1301 movdqu 0x00($inp),$xt0
1302 movdqu $xt1,0x50($out)
1303 movdqu 0x10($inp),$xt1
1304 movdqu $xt2,0x60($out)
1305 movdqu 0x20($inp),$xt2
1306 movdqu $xt3,0x70($out)
1307 lea 0x80($out),$out # size optimization
1308 movdqu 0x30($inp),$xt3
1309 pxor 0x20(%rsp),$xt0
1313 movdqu $xt0,0x00($out)
1314 movdqu $xt1,0x10($out)
1315 movdqu $xt2,0x20($out)
1316 movdqu $xt3,0x30($out)
1319 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1320 lea 0x40($inp),$inp # inp+=64*3
1322 movdqa $xt0,0x00(%rsp)
1323 movdqa $xb3,0x10(%rsp)
1324 lea 0x40($out),$out # out+=64*3
1325 movdqa $xc3,0x20(%rsp)
1326 sub \$192,$len # len-=64*3
1327 movdqa $xd3,0x30(%rsp)
1330 movzb ($inp,%r10),%eax
1331 movzb (%rsp,%r10),%ecx
1334 mov %al,-1($out,%r10)
1340 $code.=<<___ if ($win64);
1341 movaps -0xa8(%r9),%xmm6
1342 movaps -0x98(%r9),%xmm7
1343 movaps -0x88(%r9),%xmm8
1344 movaps -0x78(%r9),%xmm9
1345 movaps -0x68(%r9),%xmm10
1346 movaps -0x58(%r9),%xmm11
1347 movaps -0x48(%r9),%xmm12
1348 movaps -0x38(%r9),%xmm13
1349 movaps -0x28(%r9),%xmm14
1350 movaps -0x18(%r9),%xmm15
1354 .cfi_def_cfa_register %rsp
1358 .size ChaCha20_4x,.-ChaCha20_4x
1362 ########################################################################
1363 # XOP code path that handles all lengths.
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at the code below, you'll notice that
# sometimes the argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on FX-4100...
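#
# XOP's vprotd does each rotate in a single instruction, replacing the
# copy/shift/shift/or sequences of the SSSE3 path, which is why
# Bulldozer-class CPUs run this path (cf. footnote (iv) at the top).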
1371 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1372 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1373 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1374 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1376 sub XOP_lane_ROUND {
1377 my ($a0,$b0,$c0,$d0)=@_;
1378 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1379 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1380 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1381 my @x=map("\"$_\"",@xx);
1384 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1385 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1386 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1387 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1388 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1389 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1390 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1391 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1392 "&vprotd (@x[$d0],@x[$d0],16)",
1393 "&vprotd (@x[$d1],@x[$d1],16)",
1394 "&vprotd (@x[$d2],@x[$d2],16)",
1395 "&vprotd (@x[$d3],@x[$d3],16)",
1397 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1398 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1399 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1400 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1401 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1402 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1403 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1404 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1405 "&vprotd (@x[$b0],@x[$b0],12)",
1406 "&vprotd (@x[$b1],@x[$b1],12)",
1407 "&vprotd (@x[$b2],@x[$b2],12)",
1408 "&vprotd (@x[$b3],@x[$b3],12)",
1410 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1411 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1412 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1413 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1414 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1415 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1416 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1417 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1418 "&vprotd (@x[$d0],@x[$d0],8)",
1419 "&vprotd (@x[$d1],@x[$d1],8)",
1420 "&vprotd (@x[$d2],@x[$d2],8)",
1421 "&vprotd (@x[$d3],@x[$d3],8)",
1423 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1424 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1425 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1426 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1427 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1428 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1429 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1430 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1431 "&vprotd (@x[$b0],@x[$b0],7)",
1432 "&vprotd (@x[$b1],@x[$b1],7)",
1433 "&vprotd (@x[$b2],@x[$b2],7)",
1434 "&vprotd (@x[$b3],@x[$b3],7)"
1438 my $xframe = $win64 ? 0xa8 : 8;
1441 .type ChaCha20_4xop,\@function,5
1446 mov %rsp,%r9 # frame pointer
1447 .cfi_def_cfa_register %r9
1448 sub \$0x140+$xframe,%rsp
1450 ################ stack layout
1451 # +0x00 SIMD equivalent of @x[8-12]
1453 # +0x40 constant copy of key[0-2] smashed by lanes
1455 # +0x100 SIMD counters (with nonce smashed by lanes)
1458 $code.=<<___ if ($win64);
1459 movaps %xmm6,-0xa8(%r9)
1460 movaps %xmm7,-0x98(%r9)
1461 movaps %xmm8,-0x88(%r9)
1462 movaps %xmm9,-0x78(%r9)
1463 movaps %xmm10,-0x68(%r9)
1464 movaps %xmm11,-0x58(%r9)
1465 movaps %xmm12,-0x48(%r9)
1466 movaps %xmm13,-0x38(%r9)
1467 movaps %xmm14,-0x28(%r9)
1468 movaps %xmm15,-0x18(%r9)
1474 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1475 vmovdqu ($key),$xb3 # key[1]
1476 vmovdqu 16($key),$xt3 # key[2]
1477 vmovdqu ($counter),$xd3 # key[3]
1478 lea 0x100(%rsp),%rcx # size optimization
1480 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1481 vpshufd \$0x55,$xa3,$xa1
1482 vmovdqa $xa0,0x40(%rsp) # ... and offload
1483 vpshufd \$0xaa,$xa3,$xa2
1484 vmovdqa $xa1,0x50(%rsp)
1485 vpshufd \$0xff,$xa3,$xa3
1486 vmovdqa $xa2,0x60(%rsp)
1487 vmovdqa $xa3,0x70(%rsp)
1489 vpshufd \$0x00,$xb3,$xb0
1490 vpshufd \$0x55,$xb3,$xb1
1491 vmovdqa $xb0,0x80-0x100(%rcx)
1492 vpshufd \$0xaa,$xb3,$xb2
1493 vmovdqa $xb1,0x90-0x100(%rcx)
1494 vpshufd \$0xff,$xb3,$xb3
1495 vmovdqa $xb2,0xa0-0x100(%rcx)
1496 vmovdqa $xb3,0xb0-0x100(%rcx)
1498 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1499 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1500 vmovdqa $xt0,0xc0-0x100(%rcx)
1501 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1502 vmovdqa $xt1,0xd0-0x100(%rcx)
1503 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1504 vmovdqa $xt2,0xe0-0x100(%rcx)
1505 vmovdqa $xt3,0xf0-0x100(%rcx)
1507 vpshufd \$0x00,$xd3,$xd0
1508 vpshufd \$0x55,$xd3,$xd1
1509 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1510 vpshufd \$0xaa,$xd3,$xd2
1511 vmovdqa $xd1,0x110-0x100(%rcx)
1512 vpshufd \$0xff,$xd3,$xd3
1513 vmovdqa $xd2,0x120-0x100(%rcx)
1514 vmovdqa $xd3,0x130-0x100(%rcx)
1520 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1521 vmovdqa 0x50(%rsp),$xa1
1522 vmovdqa 0x60(%rsp),$xa2
1523 vmovdqa 0x70(%rsp),$xa3
1524 vmovdqa 0x80-0x100(%rcx),$xb0
1525 vmovdqa 0x90-0x100(%rcx),$xb1
1526 vmovdqa 0xa0-0x100(%rcx),$xb2
1527 vmovdqa 0xb0-0x100(%rcx),$xb3
1528 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1529 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1530 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1531 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1532 vmovdqa 0x100-0x100(%rcx),$xd0
1533 vmovdqa 0x110-0x100(%rcx),$xd1
1534 vmovdqa 0x120-0x100(%rcx),$xd2
1535 vmovdqa 0x130-0x100(%rcx),$xd3
1536 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1540 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1546 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1547 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1552 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1553 vpaddd 0x50(%rsp),$xa1,$xa1
1554 vpaddd 0x60(%rsp),$xa2,$xa2
1555 vpaddd 0x70(%rsp),$xa3,$xa3
1557 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1558 vmovdqa $xt3,0x30(%rsp)
1560 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1561 vpunpckldq $xa3,$xa2,$xt3
1562 vpunpckhdq $xa1,$xa0,$xa0
1563 vpunpckhdq $xa3,$xa2,$xa2
1564 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1565 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1566 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1567 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1569 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1571 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1572 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1573 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1574 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1576 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1577 vmovdqa $xa1,0x10(%rsp)
1578 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1579 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1581 vpunpckldq $xb1,$xb0,$xt2
1582 vpunpckldq $xb3,$xb2,$xt3
1583 vpunpckhdq $xb1,$xb0,$xb0
1584 vpunpckhdq $xb3,$xb2,$xb2
1585 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1586 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1587 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1588 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1590 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1591 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1593 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1594 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1595 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1596 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1598 vpunpckldq $xc1,$xc0,$xt2
1599 vpunpckldq $xc3,$xc2,$xt3
1600 vpunpckhdq $xc1,$xc0,$xc0
1601 vpunpckhdq $xc3,$xc2,$xc2
1602 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1603 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1604 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1605 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1607 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1609 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1610 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1611 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1612 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1614 vpunpckldq $xd1,$xd0,$xt2
1615 vpunpckldq $xd3,$xd2,$xt3
1616 vpunpckhdq $xd1,$xd0,$xd0
1617 vpunpckhdq $xd3,$xd2,$xd2
1618 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1619 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1620 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1621 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1623 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1624 ($xa0,$xa1)=($xt2,$xt3);
1626 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1627 vmovdqa 0x10(%rsp),$xa1
1632 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1633 vpxor 0x10($inp),$xb0,$xb0
1634 vpxor 0x20($inp),$xc0,$xc0
1635 vpxor 0x30($inp),$xd0,$xd0
1636 vpxor 0x40($inp),$xa1,$xa1
1637 vpxor 0x50($inp),$xb1,$xb1
1638 vpxor 0x60($inp),$xc1,$xc1
1639 vpxor 0x70($inp),$xd1,$xd1
1640 lea 0x80($inp),$inp # size optimization
1641 vpxor 0x00($inp),$xa2,$xa2
1642 vpxor 0x10($inp),$xb2,$xb2
1643 vpxor 0x20($inp),$xc2,$xc2
1644 vpxor 0x30($inp),$xd2,$xd2
1645 vpxor 0x40($inp),$xa3,$xa3
1646 vpxor 0x50($inp),$xb3,$xb3
1647 vpxor 0x60($inp),$xc3,$xc3
1648 vpxor 0x70($inp),$xd3,$xd3
1649 lea 0x80($inp),$inp # inp+=64*4
1651 vmovdqu $xa0,0x00($out)
1652 vmovdqu $xb0,0x10($out)
1653 vmovdqu $xc0,0x20($out)
1654 vmovdqu $xd0,0x30($out)
1655 vmovdqu $xa1,0x40($out)
1656 vmovdqu $xb1,0x50($out)
1657 vmovdqu $xc1,0x60($out)
1658 vmovdqu $xd1,0x70($out)
1659 lea 0x80($out),$out # size optimization
1660 vmovdqu $xa2,0x00($out)
1661 vmovdqu $xb2,0x10($out)
1662 vmovdqu $xc2,0x20($out)
1663 vmovdqu $xd2,0x30($out)
1664 vmovdqu $xa3,0x40($out)
1665 vmovdqu $xb3,0x50($out)
1666 vmovdqu $xc3,0x60($out)
1667 vmovdqu $xd3,0x70($out)
1668 lea 0x80($out),$out # out+=64*4
1678 jae .L192_or_more4xop
1680 jae .L128_or_more4xop
1682 jae .L64_or_more4xop
1685 vmovdqa $xa0,0x00(%rsp)
1686 vmovdqa $xb0,0x10(%rsp)
1687 vmovdqa $xc0,0x20(%rsp)
1688 vmovdqa $xd0,0x30(%rsp)
1693 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1694 vpxor 0x10($inp),$xb0,$xb0
1695 vpxor 0x20($inp),$xc0,$xc0
1696 vpxor 0x30($inp),$xd0,$xd0
1697 vmovdqu $xa0,0x00($out)
1698 vmovdqu $xb0,0x10($out)
1699 vmovdqu $xc0,0x20($out)
1700 vmovdqu $xd0,0x30($out)
1703 lea 0x40($inp),$inp # inp+=64*1
1704 vmovdqa $xa1,0x00(%rsp)
1706 vmovdqa $xb1,0x10(%rsp)
1707 lea 0x40($out),$out # out+=64*1
1708 vmovdqa $xc1,0x20(%rsp)
1709 sub \$64,$len # len-=64*1
1710 vmovdqa $xd1,0x30(%rsp)
1715 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1716 vpxor 0x10($inp),$xb0,$xb0
1717 vpxor 0x20($inp),$xc0,$xc0
1718 vpxor 0x30($inp),$xd0,$xd0
1719 vpxor 0x40($inp),$xa1,$xa1
1720 vpxor 0x50($inp),$xb1,$xb1
1721 vpxor 0x60($inp),$xc1,$xc1
1722 vpxor 0x70($inp),$xd1,$xd1
1724 vmovdqu $xa0,0x00($out)
1725 vmovdqu $xb0,0x10($out)
1726 vmovdqu $xc0,0x20($out)
1727 vmovdqu $xd0,0x30($out)
1728 vmovdqu $xa1,0x40($out)
1729 vmovdqu $xb1,0x50($out)
1730 vmovdqu $xc1,0x60($out)
1731 vmovdqu $xd1,0x70($out)
1734 lea 0x80($inp),$inp # inp+=64*2
1735 vmovdqa $xa2,0x00(%rsp)
1737 vmovdqa $xb2,0x10(%rsp)
1738 lea 0x80($out),$out # out+=64*2
1739 vmovdqa $xc2,0x20(%rsp)
1740 sub \$128,$len # len-=64*2
1741 vmovdqa $xd2,0x30(%rsp)
1746 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1747 vpxor 0x10($inp),$xb0,$xb0
1748 vpxor 0x20($inp),$xc0,$xc0
1749 vpxor 0x30($inp),$xd0,$xd0
1750 vpxor 0x40($inp),$xa1,$xa1
1751 vpxor 0x50($inp),$xb1,$xb1
1752 vpxor 0x60($inp),$xc1,$xc1
1753 vpxor 0x70($inp),$xd1,$xd1
1754 lea 0x80($inp),$inp # size optimization
1755 vpxor 0x00($inp),$xa2,$xa2
1756 vpxor 0x10($inp),$xb2,$xb2
1757 vpxor 0x20($inp),$xc2,$xc2
1758 vpxor 0x30($inp),$xd2,$xd2
1760 vmovdqu $xa0,0x00($out)
1761 vmovdqu $xb0,0x10($out)
1762 vmovdqu $xc0,0x20($out)
1763 vmovdqu $xd0,0x30($out)
1764 vmovdqu $xa1,0x40($out)
1765 vmovdqu $xb1,0x50($out)
1766 vmovdqu $xc1,0x60($out)
1767 vmovdqu $xd1,0x70($out)
1768 lea 0x80($out),$out # size optimization
1769 vmovdqu $xa2,0x00($out)
1770 vmovdqu $xb2,0x10($out)
1771 vmovdqu $xc2,0x20($out)
1772 vmovdqu $xd2,0x30($out)
1775 lea 0x40($inp),$inp # inp+=64*3
1776 vmovdqa $xa3,0x00(%rsp)
1778 vmovdqa $xb3,0x10(%rsp)
1779 lea 0x40($out),$out # out+=64*3
1780 vmovdqa $xc3,0x20(%rsp)
1781 sub \$192,$len # len-=64*3
1782 vmovdqa $xd3,0x30(%rsp)
1785 movzb ($inp,%r10),%eax
1786 movzb (%rsp,%r10),%ecx
1789 mov %al,-1($out,%r10)
1796 $code.=<<___ if ($win64);
1797 movaps -0xa8(%r9),%xmm6
1798 movaps -0x98(%r9),%xmm7
1799 movaps -0x88(%r9),%xmm8
1800 movaps -0x78(%r9),%xmm9
1801 movaps -0x68(%r9),%xmm10
1802 movaps -0x58(%r9),%xmm11
1803 movaps -0x48(%r9),%xmm12
1804 movaps -0x38(%r9),%xmm13
1805 movaps -0x28(%r9),%xmm14
1806 movaps -0x18(%r9),%xmm15
1810 .cfi_def_cfa_register %rsp
1814 .size ChaCha20_4xop,.-ChaCha20_4xop
########################################################################
# AVX2 code path
1821 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1822 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1823 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1824 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1826 sub AVX2_lane_ROUND {
1827 my ($a0,$b0,$c0,$d0)=@_;
1828 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1829 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1830 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1831 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1832 my @x=map("\"$_\"",@xx);
# Consider the order in which variables are addressed by their
# index:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
# 'a', 'b' and 'd's are permanently allocated in registers,
# @x[0..7,12..15], while 'c's are maintained in memory. If
# you observe the 'c' column, you'll notice that a pair of 'c's
# is invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see
# a bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
1857 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1858 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1859 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1860 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1861 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1862 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1864 "&vpaddd ($xc,$xc,@x[$d0])",
1865 "&vpxor (@x[$b0],$xc,@x[$b0])",
1866 "&vpslld ($t0,@x[$b0],12)",
1867 "&vpsrld (@x[$b0],@x[$b0],20)",
1868 "&vpor (@x[$b0],$t0,@x[$b0])",
1869 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1870 "&vpaddd ($xc_,$xc_,@x[$d1])",
1871 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1872 "&vpslld ($t1,@x[$b1],12)",
1873 "&vpsrld (@x[$b1],@x[$b1],20)",
1874 "&vpor (@x[$b1],$t1,@x[$b1])",
1876 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1877 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1878 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1879 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1880 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1881 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1883 "&vpaddd ($xc,$xc,@x[$d0])",
1884 "&vpxor (@x[$b0],$xc,@x[$b0])",
1885 "&vpslld ($t1,@x[$b0],7)",
1886 "&vpsrld (@x[$b0],@x[$b0],25)",
1887 "&vpor (@x[$b0],$t1,@x[$b0])",
1888 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1889 "&vpaddd ($xc_,$xc_,@x[$d1])",
1890 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1891 "&vpslld ($t0,@x[$b1],7)",
1892 "&vpsrld (@x[$b1],@x[$b1],25)",
1893 "&vpor (@x[$b1],$t0,@x[$b1])",
1895 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1896 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1897 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1898 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1900 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1901 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1902 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1903 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1904 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1905 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1907 "&vpaddd ($xc,$xc,@x[$d2])",
1908 "&vpxor (@x[$b2],$xc,@x[$b2])",
1909 "&vpslld ($t0,@x[$b2],12)",
1910 "&vpsrld (@x[$b2],@x[$b2],20)",
1911 "&vpor (@x[$b2],$t0,@x[$b2])",
1912 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1913 "&vpaddd ($xc_,$xc_,@x[$d3])",
1914 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1915 "&vpslld ($t1,@x[$b3],12)",
1916 "&vpsrld (@x[$b3],@x[$b3],20)",
1917 "&vpor (@x[$b3],$t1,@x[$b3])",
1919 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1920 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1921 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1922 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1923 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1924 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1926 "&vpaddd ($xc,$xc,@x[$d2])",
1927 "&vpxor (@x[$b2],$xc,@x[$b2])",
1928 "&vpslld ($t1,@x[$b2],7)",
1929 "&vpsrld (@x[$b2],@x[$b2],25)",
1930 "&vpor (@x[$b2],$t1,@x[$b2])",
1931 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1932 "&vpaddd ($xc_,$xc_,@x[$d3])",
1933 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1934 "&vpslld ($t0,@x[$b3],7)",
1935 "&vpsrld (@x[$b3],@x[$b3],25)",
1936 "&vpor (@x[$b3],$t0,@x[$b3])"
1940 my $xframe = $win64 ? 0xa8 : 8;
1943 .type ChaCha20_8x,\@function,5
mov %rsp,%r9 # frame pointer
1949 .cfi_def_cfa_register %r9
1950 sub \$0x280+$xframe,%rsp
1953 $code.=<<___ if ($win64);
1954 movaps %xmm6,-0xa8(%r9)
1955 movaps %xmm7,-0x98(%r9)
1956 movaps %xmm8,-0x88(%r9)
1957 movaps %xmm9,-0x78(%r9)
1958 movaps %xmm10,-0x68(%r9)
1959 movaps %xmm11,-0x58(%r9)
1960 movaps %xmm12,-0x48(%r9)
1961 movaps %xmm13,-0x38(%r9)
1962 movaps %xmm14,-0x28(%r9)
1963 movaps %xmm15,-0x18(%r9)
1969 ################ stack layout
1970 # +0x00 SIMD equivalent of @x[8-12]
1972 # +0x80 constant copy of key[0-2] smashed by lanes
1974 # +0x200 SIMD counters (with nonce smashed by lanes)
1978 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1979 vbroadcasti128 ($key),$xb3 # key[1]
1980 vbroadcasti128 16($key),$xt3 # key[2]
1981 vbroadcasti128 ($counter),$xd3 # key[3]
1982 lea 0x100(%rsp),%rcx # size optimization
1983 lea 0x200(%rsp),%rax # size optimization
1984 lea .Lrot16(%rip),%r10
1985 lea .Lrot24(%rip),%r11
1987 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1988 vpshufd \$0x55,$xa3,$xa1
1989 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1990 vpshufd \$0xaa,$xa3,$xa2
1991 vmovdqa $xa1,0xa0-0x100(%rcx)
1992 vpshufd \$0xff,$xa3,$xa3
1993 vmovdqa $xa2,0xc0-0x100(%rcx)
1994 vmovdqa $xa3,0xe0-0x100(%rcx)
1996 vpshufd \$0x00,$xb3,$xb0
1997 vpshufd \$0x55,$xb3,$xb1
1998 vmovdqa $xb0,0x100-0x100(%rcx)
1999 vpshufd \$0xaa,$xb3,$xb2
2000 vmovdqa $xb1,0x120-0x100(%rcx)
2001 vpshufd \$0xff,$xb3,$xb3
2002 vmovdqa $xb2,0x140-0x100(%rcx)
2003 vmovdqa $xb3,0x160-0x100(%rcx)
2005 vpshufd \$0x00,$xt3,$xt0 # "xc0"
2006 vpshufd \$0x55,$xt3,$xt1 # "xc1"
2007 vmovdqa $xt0,0x180-0x200(%rax)
2008 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
2009 vmovdqa $xt1,0x1a0-0x200(%rax)
2010 vpshufd \$0xff,$xt3,$xt3 # "xc3"
2011 vmovdqa $xt2,0x1c0-0x200(%rax)
2012 vmovdqa $xt3,0x1e0-0x200(%rax)
2014 vpshufd \$0x00,$xd3,$xd0
2015 vpshufd \$0x55,$xd3,$xd1
2016 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
2017 vpshufd \$0xaa,$xd3,$xd2
2018 vmovdqa $xd1,0x220-0x200(%rax)
2019 vpshufd \$0xff,$xd3,$xd3
2020 vmovdqa $xd2,0x240-0x200(%rax)
2021 vmovdqa $xd3,0x260-0x200(%rax)
2027 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
2028 vmovdqa 0xa0-0x100(%rcx),$xa1
2029 vmovdqa 0xc0-0x100(%rcx),$xa2
2030 vmovdqa 0xe0-0x100(%rcx),$xa3
2031 vmovdqa 0x100-0x100(%rcx),$xb0
2032 vmovdqa 0x120-0x100(%rcx),$xb1
2033 vmovdqa 0x140-0x100(%rcx),$xb2
2034 vmovdqa 0x160-0x100(%rcx),$xb3
2035 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
2036 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
2037 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
2038 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
2039 vmovdqa 0x200-0x200(%rax),$xd0
2040 vmovdqa 0x220-0x200(%rax),$xd1
2041 vmovdqa 0x240-0x200(%rax),$xd2
2042 vmovdqa 0x260-0x200(%rax),$xd3
2043 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
2046 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
2047 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
2048 vbroadcasti128 (%r10),$xt3
2049 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
2056 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2057 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
2062 lea 0x200(%rsp),%rax # size optimization
2063 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
2064 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
2065 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
2066 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
2068 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2069 vpunpckldq $xa3,$xa2,$xt3
2070 vpunpckhdq $xa1,$xa0,$xa0
2071 vpunpckhdq $xa3,$xa2,$xa2
2072 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2073 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2074 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2075 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2077 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2079 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
2080 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
2081 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
2082 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
2084 vpunpckldq $xb1,$xb0,$xt2
2085 vpunpckldq $xb3,$xb2,$xt3
2086 vpunpckhdq $xb1,$xb0,$xb0
2087 vpunpckhdq $xb3,$xb2,$xb2
2088 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2089 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2090 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2091 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2093 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2095 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
2096 vperm2i128 \$0x31,$xb0,$xa0,$xb0
2097 vperm2i128 \$0x20,$xb1,$xa1,$xa0
2098 vperm2i128 \$0x31,$xb1,$xa1,$xb1
2099 vperm2i128 \$0x20,$xb2,$xa2,$xa1
2100 vperm2i128 \$0x31,$xb2,$xa2,$xb2
2101 vperm2i128 \$0x20,$xb3,$xa3,$xa2
2102 vperm2i128 \$0x31,$xb3,$xa3,$xb3
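
# vperm2i128 with \$0x20 concatenates the low 128-bit halves of its two
# sources, \$0x31 the high halves; together with the dword/qword
# unpacks above this regroups the lane-sliced data into contiguous
# 64-byte output blocks.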
2104 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2105 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2107 vmovdqa $xa0,0x00(%rsp) # offload $xaN
2108 vmovdqa $xa1,0x20(%rsp)
2109 vmovdqa 0x40(%rsp),$xc2 # $xa0
2110 vmovdqa 0x60(%rsp),$xc3 # $xa1
2112 vpaddd 0x180-0x200(%rax),$xc0,$xc0
2113 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
2114 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
2115 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
2117 vpunpckldq $xc1,$xc0,$xt2
2118 vpunpckldq $xc3,$xc2,$xt3
2119 vpunpckhdq $xc1,$xc0,$xc0
2120 vpunpckhdq $xc3,$xc2,$xc2
2121 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2122 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2123 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2124 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2126 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2128 vpaddd 0x200-0x200(%rax),$xd0,$xd0
2129 vpaddd 0x220-0x200(%rax),$xd1,$xd1
2130 vpaddd 0x240-0x200(%rax),$xd2,$xd2
2131 vpaddd 0x260-0x200(%rax),$xd3,$xd3
2133 vpunpckldq $xd1,$xd0,$xt2
2134 vpunpckldq $xd3,$xd2,$xt3
2135 vpunpckhdq $xd1,$xd0,$xd0
2136 vpunpckhdq $xd3,$xd2,$xd2
2137 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2138 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2139 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2140 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2142 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2144 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
2145 vperm2i128 \$0x31,$xd0,$xc0,$xd0
2146 vperm2i128 \$0x20,$xd1,$xc1,$xc0
2147 vperm2i128 \$0x31,$xd1,$xc1,$xd1
2148 vperm2i128 \$0x20,$xd2,$xc2,$xc1
2149 vperm2i128 \$0x31,$xd2,$xc2,$xd2
2150 vperm2i128 \$0x20,$xd3,$xc3,$xc2
2151 vperm2i128 \$0x31,$xd3,$xc3,$xd3
2153 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2154 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2155 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2156 ($xa0,$xa1)=($xt2,$xt3);
2158 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
2159 vmovdqa 0x20(%rsp),$xa1
2164 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2165 vpxor 0x20($inp),$xb0,$xb0
2166 vpxor 0x40($inp),$xc0,$xc0
2167 vpxor 0x60($inp),$xd0,$xd0
2168 lea 0x80($inp),$inp # size optimization
2169 vmovdqu $xa0,0x00($out)
2170 vmovdqu $xb0,0x20($out)
2171 vmovdqu $xc0,0x40($out)
2172 vmovdqu $xd0,0x60($out)
2173 lea 0x80($out),$out # size optimization
2175 vpxor 0x00($inp),$xa1,$xa1
2176 vpxor 0x20($inp),$xb1,$xb1
2177 vpxor 0x40($inp),$xc1,$xc1
2178 vpxor 0x60($inp),$xd1,$xd1
2179 lea 0x80($inp),$inp # size optimization
2180 vmovdqu $xa1,0x00($out)
2181 vmovdqu $xb1,0x20($out)
2182 vmovdqu $xc1,0x40($out)
2183 vmovdqu $xd1,0x60($out)
2184 lea 0x80($out),$out # size optimization
2186 vpxor 0x00($inp),$xa2,$xa2
2187 vpxor 0x20($inp),$xb2,$xb2
2188 vpxor 0x40($inp),$xc2,$xc2
2189 vpxor 0x60($inp),$xd2,$xd2
2190 lea 0x80($inp),$inp # size optimization
2191 vmovdqu $xa2,0x00($out)
2192 vmovdqu $xb2,0x20($out)
2193 vmovdqu $xc2,0x40($out)
2194 vmovdqu $xd2,0x60($out)
2195 lea 0x80($out),$out # size optimization
2197 vpxor 0x00($inp),$xa3,$xa3
2198 vpxor 0x20($inp),$xb3,$xb3
2199 vpxor 0x40($inp),$xc3,$xc3
2200 vpxor 0x60($inp),$xd3,$xd3
2201 lea 0x80($inp),$inp # size optimization
2202 vmovdqu $xa3,0x00($out)
2203 vmovdqu $xb3,0x20($out)
2204 vmovdqu $xc3,0x40($out)
2205 vmovdqu $xd3,0x60($out)
2206 lea 0x80($out),$out # size optimization
2230 vmovdqa $xa0,0x00(%rsp)
2231 vmovdqa $xb0,0x20(%rsp)
2236 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2237 vpxor 0x20($inp),$xb0,$xb0
2238 vmovdqu $xa0,0x00($out)
2239 vmovdqu $xb0,0x20($out)
2242 lea 0x40($inp),$inp # inp+=64*1
2244 vmovdqa $xc0,0x00(%rsp)
2245 lea 0x40($out),$out # out+=64*1
2246 sub \$64,$len # len-=64*1
2247 vmovdqa $xd0,0x20(%rsp)
2252 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2253 vpxor 0x20($inp),$xb0,$xb0
2254 vpxor 0x40($inp),$xc0,$xc0
2255 vpxor 0x60($inp),$xd0,$xd0
2256 vmovdqu $xa0,0x00($out)
2257 vmovdqu $xb0,0x20($out)
2258 vmovdqu $xc0,0x40($out)
2259 vmovdqu $xd0,0x60($out)
2262 lea 0x80($inp),$inp # inp+=64*2
2264 vmovdqa $xa1,0x00(%rsp)
2265 lea 0x80($out),$out # out+=64*2
2266 sub \$128,$len # len-=64*2
2267 vmovdqa $xb1,0x20(%rsp)
2272 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2273 vpxor 0x20($inp),$xb0,$xb0
2274 vpxor 0x40($inp),$xc0,$xc0
2275 vpxor 0x60($inp),$xd0,$xd0
2276 vpxor 0x80($inp),$xa1,$xa1
2277 vpxor 0xa0($inp),$xb1,$xb1
2278 vmovdqu $xa0,0x00($out)
2279 vmovdqu $xb0,0x20($out)
2280 vmovdqu $xc0,0x40($out)
2281 vmovdqu $xd0,0x60($out)
2282 vmovdqu $xa1,0x80($out)
2283 vmovdqu $xb1,0xa0($out)
2286 lea 0xc0($inp),$inp # inp+=64*3
2288 vmovdqa $xc1,0x00(%rsp)
2289 lea 0xc0($out),$out # out+=64*3
2290 sub \$192,$len # len-=64*3
2291 vmovdqa $xd1,0x20(%rsp)
2296 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2297 vpxor 0x20($inp),$xb0,$xb0
2298 vpxor 0x40($inp),$xc0,$xc0
2299 vpxor 0x60($inp),$xd0,$xd0
2300 vpxor 0x80($inp),$xa1,$xa1
2301 vpxor 0xa0($inp),$xb1,$xb1
2302 vpxor 0xc0($inp),$xc1,$xc1
2303 vpxor 0xe0($inp),$xd1,$xd1
2304 vmovdqu $xa0,0x00($out)
2305 vmovdqu $xb0,0x20($out)
2306 vmovdqu $xc0,0x40($out)
2307 vmovdqu $xd0,0x60($out)
2308 vmovdqu $xa1,0x80($out)
2309 vmovdqu $xb1,0xa0($out)
2310 vmovdqu $xc1,0xc0($out)
2311 vmovdqu $xd1,0xe0($out)
2314 lea 0x100($inp),$inp # inp+=64*4
2316 vmovdqa $xa2,0x00(%rsp)
2317 lea 0x100($out),$out # out+=64*4
2318 sub \$256,$len # len-=64*4
2319 vmovdqa $xb2,0x20(%rsp)
2324 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2325 vpxor 0x20($inp),$xb0,$xb0
2326 vpxor 0x40($inp),$xc0,$xc0
2327 vpxor 0x60($inp),$xd0,$xd0
2328 vpxor 0x80($inp),$xa1,$xa1
2329 vpxor 0xa0($inp),$xb1,$xb1
2330 vpxor 0xc0($inp),$xc1,$xc1
2331 vpxor 0xe0($inp),$xd1,$xd1
2332 vpxor 0x100($inp),$xa2,$xa2
2333 vpxor 0x120($inp),$xb2,$xb2
2334 vmovdqu $xa0,0x00($out)
2335 vmovdqu $xb0,0x20($out)
2336 vmovdqu $xc0,0x40($out)
2337 vmovdqu $xd0,0x60($out)
2338 vmovdqu $xa1,0x80($out)
2339 vmovdqu $xb1,0xa0($out)
2340 vmovdqu $xc1,0xc0($out)
2341 vmovdqu $xd1,0xe0($out)
2342 vmovdqu $xa2,0x100($out)
2343 vmovdqu $xb2,0x120($out)
2346 lea 0x140($inp),$inp # inp+=64*5
2348 vmovdqa $xc2,0x00(%rsp)
2349 lea 0x140($out),$out # out+=64*5
2350 sub \$320,$len # len-=64*5
2351 vmovdqa $xd2,0x20(%rsp)
2356 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2357 vpxor 0x20($inp),$xb0,$xb0
2358 vpxor 0x40($inp),$xc0,$xc0
2359 vpxor 0x60($inp),$xd0,$xd0
2360 vpxor 0x80($inp),$xa1,$xa1
2361 vpxor 0xa0($inp),$xb1,$xb1
2362 vpxor 0xc0($inp),$xc1,$xc1
2363 vpxor 0xe0($inp),$xd1,$xd1
2364 vpxor 0x100($inp),$xa2,$xa2
2365 vpxor 0x120($inp),$xb2,$xb2
2366 vpxor 0x140($inp),$xc2,$xc2
2367 vpxor 0x160($inp),$xd2,$xd2
2368 vmovdqu $xa0,0x00($out)
2369 vmovdqu $xb0,0x20($out)
2370 vmovdqu $xc0,0x40($out)
2371 vmovdqu $xd0,0x60($out)
2372 vmovdqu $xa1,0x80($out)
2373 vmovdqu $xb1,0xa0($out)
2374 vmovdqu $xc1,0xc0($out)
2375 vmovdqu $xd1,0xe0($out)
2376 vmovdqu $xa2,0x100($out)
2377 vmovdqu $xb2,0x120($out)
2378 vmovdqu $xc2,0x140($out)
2379 vmovdqu $xd2,0x160($out)
2382 lea 0x180($inp),$inp # inp+=64*6
2384 vmovdqa $xa3,0x00(%rsp)
2385 lea 0x180($out),$out # out+=64*6
2386 sub \$384,$len # len-=64*6
2387 vmovdqa $xb3,0x20(%rsp)
2392 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2393 vpxor 0x20($inp),$xb0,$xb0
2394 vpxor 0x40($inp),$xc0,$xc0
2395 vpxor 0x60($inp),$xd0,$xd0
2396 vpxor 0x80($inp),$xa1,$xa1
2397 vpxor 0xa0($inp),$xb1,$xb1
2398 vpxor 0xc0($inp),$xc1,$xc1
2399 vpxor 0xe0($inp),$xd1,$xd1
2400 vpxor 0x100($inp),$xa2,$xa2
2401 vpxor 0x120($inp),$xb2,$xb2
2402 vpxor 0x140($inp),$xc2,$xc2
2403 vpxor 0x160($inp),$xd2,$xd2
2404 vpxor 0x180($inp),$xa3,$xa3
2405 vpxor 0x1a0($inp),$xb3,$xb3
2406 vmovdqu $xa0,0x00($out)
2407 vmovdqu $xb0,0x20($out)
2408 vmovdqu $xc0,0x40($out)
2409 vmovdqu $xd0,0x60($out)
2410 vmovdqu $xa1,0x80($out)
2411 vmovdqu $xb1,0xa0($out)
2412 vmovdqu $xc1,0xc0($out)
2413 vmovdqu $xd1,0xe0($out)
2414 vmovdqu $xa2,0x100($out)
2415 vmovdqu $xb2,0x120($out)
2416 vmovdqu $xc2,0x140($out)
2417 vmovdqu $xd2,0x160($out)
2418 vmovdqu $xa3,0x180($out)
2419 vmovdqu $xb3,0x1a0($out)
2422 lea 0x1c0($inp),$inp # inp+=64*7
2424 vmovdqa $xc3,0x00(%rsp)
2425 lea 0x1c0($out),$out # out+=64*7
2426 sub \$448,$len # len-=64*7
2427 vmovdqa $xd3,0x20(%rsp)
2430 movzb ($inp,%r10),%eax
2431 movzb (%rsp,%r10),%ecx
2434 mov %al,-1($out,%r10)
2441 $code.=<<___ if ($win64);
2442 movaps -0xa8(%r9),%xmm6
2443 movaps -0x98(%r9),%xmm7
2444 movaps -0x88(%r9),%xmm8
2445 movaps -0x78(%r9),%xmm9
2446 movaps -0x68(%r9),%xmm10
2447 movaps -0x58(%r9),%xmm11
2448 movaps -0x48(%r9),%xmm12
2449 movaps -0x38(%r9),%xmm13
2450 movaps -0x28(%r9),%xmm14
2451 movaps -0x18(%r9),%xmm15
2455 .cfi_def_cfa_register %rsp
2459 .size ChaCha20_8x,.-ChaCha20_8x
2463 ########################################################################
2466 # This one handles shorter inputs...
2468 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2469 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2471 sub vpxord() # size optimization
2472 { my $opcode = "vpxor"; # adhere to vpxor when possible
    foreach (@_) {	# EVEX-only registers force vpxord
	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { $opcode = "vpxord"; last; }
    }
    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
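
# The VEX form (vpxor) is one or two bytes shorter than EVEX (vpxord)
# but can only encode %xmm0-15/%ymm0-15; EVEX is required for %zmm and
# for %[xy]mm16 and up, hence the switch above.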
2484 sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
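# The shorter critical path relative to SSSE3 (14 vs 20 "ticks") comes
# mostly from AVX512's vprold, which performs each rotate in a single
# instruction instead of a pshufb or copy/shift/shift/or sequence.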
2502 my $xframe = $win64 ? 32+8 : 8;
2505 .type ChaCha20_avx512,\@function,5
2510 mov %rsp,%r9 # frame pointer
2511 .cfi_def_cfa_register %r9
2515 sub \$64+$xframe,%rsp
2517 $code.=<<___ if ($win64);
2518 movaps %xmm6,-0x28(%r9)
2519 movaps %xmm7,-0x18(%r9)
2523 vbroadcasti32x4 .Lsigma(%rip),$a
2524 vbroadcasti32x4 ($key),$b
2525 vbroadcasti32x4 16($key),$c
2526 vbroadcasti32x4 ($counter),$d
2531 vpaddd .Lzeroz(%rip),$d,$d
2532 vmovdqa32 .Lfourz(%rip),$fourz
2533 mov \$10,$counter # reuse $counter
2542 vpaddd $fourz,$d_,$d
2551 &vpshufd ($c,$c,0b01001110);
2552 &vpshufd ($b,$b,0b00111001);
2553 &vpshufd ($d,$d,0b10010011);
2556 &vpshufd ($c,$c,0b01001110);
2557 &vpshufd ($b,$b,0b10010011);
2558 &vpshufd ($d,$d,0b00111001);
2561 &jnz (".Loop_avx512");
2572 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2573 vpxor 0x10($inp),%x#$b,$t1
2574 vpxor 0x20($inp),%x#$c,$t2
2575 vpxor 0x30($inp),%x#$d,$t3
2576 lea 0x40($inp),$inp # inp+=64
2578 vmovdqu $t0,0x00($out) # write output
2579 vmovdqu $t1,0x10($out)
2580 vmovdqu $t2,0x20($out)
2581 vmovdqu $t3,0x30($out)
2582 lea 0x40($out),$out # out+=64
	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	jnz	.Loop_outer_avx512

	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)

	jmp	.Loop_tail_avx512

	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
.Loop_tail_avx512:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512
	vmovdqu32	$a_,0x00(%rsp)

$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
.cfi_def_cfa_register	%rsp
.size	ChaCha20_avx512,.-ChaCha20_avx512

map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
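# Re-instantiate the same code generators at %ymm width: the register
# names are simply rewritten, so ChaCha20_avx512vl below is emitted
# from identical Perl with two blocks per iteration instead of four.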
.type	ChaCha20_avx512vl,\@function,5
.LChaCha20_avx512vl:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
___
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
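	# only two 64-byte blocks fit in a %ymm register, so the
	# counter step loaded into $fourz is 2 (.Ltwoy) rather than 4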
	mov	\$10,$counter		# reuse $counter

.Loop_outer_avx512vl:
	vpaddd	$fourz,$d_,$d
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_avx512vl");
	jb	.Ltail64_avx512vl

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	jnz	.Loop_outer_avx512vl

	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)

	jmp	.Loop_tail_avx512vl

	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
.Loop_tail_avx512vl:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512vl
	vmovdqu32	$a_,0x00(%rsp)
	vmovdqu32	$a_,0x20(%rsp)

$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl

# This one handles longer inputs...

my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);
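# The index arithmetic rotates each quartet one position within its
# row group, so a call with (0,4,8,12) yields the column quartets
# (0,4,8,12),(1,5,9,13),(2,6,10,14),(3,7,11,15), and a call with
# (0,5,10,15) yields the diagonals (0,5,10,15),(1,6,11,12),
# (2,7,8,13),(3,4,9,14).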
2869 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2870 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2871 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2872 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2873 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2874 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2875 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2876 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2877 "&vprold (@x[$d0],@x[$d0],16)",
2878 "&vprold (@x[$d1],@x[$d1],16)",
2879 "&vprold (@x[$d2],@x[$d2],16)",
2880 "&vprold (@x[$d3],@x[$d3],16)",
2882 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2883 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2884 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2885 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2886 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2887 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2888 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2889 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2890 "&vprold (@x[$b0],@x[$b0],12)",
2891 "&vprold (@x[$b1],@x[$b1],12)",
2892 "&vprold (@x[$b2],@x[$b2],12)",
2893 "&vprold (@x[$b3],@x[$b3],12)",
2895 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2896 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2897 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2898 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2899 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2900 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2901 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2902 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2903 "&vprold (@x[$d0],@x[$d0],8)",
2904 "&vprold (@x[$d1],@x[$d1],8)",
2905 "&vprold (@x[$d2],@x[$d2],8)",
2906 "&vprold (@x[$d3],@x[$d3],8)",
2908 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2909 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2910 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2911 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2912 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2913 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2914 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2915 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2916 "&vprold (@x[$b0],@x[$b0],7)",
2917 "&vprold (@x[$b1],@x[$b1],7)",
2918 "&vprold (@x[$b2],@x[$b2],7)",
2919 "&vprold (@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xa8 : 8;

.type	ChaCha20_16x,\@function,5
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
___
	lea	.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3		# key[0]
	vbroadcasti32x4	($key),$xb3		# key[1]
	vbroadcasti32x4	16($key),$xc3		# key[2]
	vbroadcasti32x4	($counter),$xd3		# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vpshufd	\$0xaa,$xa3,$xa2
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vpshufd	\$0xaa,$xb3,$xb2
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd	\$0x00,$xc3,$xc0
	vpshufd	\$0x55,$xc3,$xc1
	vpshufd	\$0xaa,$xc3,$xc2
	vpshufd	\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpshufd	\$0xaa,$xd3,$xd2
	vpshufd	\$0xff,$xd3,$xd3
	vpaddd	.Lincz(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]
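	# After the "smash" every register holds one 32-bit state word
	# replicated 16 times, i.e. sixteen blocks are processed in
	# lane-sliced (vertical) form; .Lincz turns the replicated
	# counter word into sixteen consecutive block counters.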
	vpbroadcastd	0(%r10),$xa0	# reload key
	vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd	.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
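# One double round per pair of calls: quarter-rounds down the columns
# (0,4,8,12), then along the diagonals (0,5,10,15). The sub returns
# the four quarter-rounds interleaved instruction by instruction, and
# eval'ing each string appends it to $code in that order.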
	vpaddd	@key[0],$xa0,$xa0	# accumulate key
	vpaddd	@key[1],$xa1,$xa1
	vpaddd	@key[2],$xa2,$xa2
	vpaddd	@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"

	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
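# Each punpck pass is a 4x4 transpose of 32-bit words within 128-bit
# lanes; the Perl-level renaming afterwards restores the logical
# names without spending any register-move instructions.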
	vpaddd	@key[4],$xb0,$xb0
	vpaddd	@key[5],$xb1,$xb1
	vpaddd	@key[6],$xb2,$xb2
	vpaddd	@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);

	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3

	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);

	vpaddd	@key[8],$xc0,$xc0
	vpaddd	@key[9],$xc1,$xc1
	vpaddd	@key[10],$xc2,$xc2
	vpaddd	@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	@key[12],$xd0,$xd0
	vpaddd	@key[13],$xd1,$xd1
	vpaddd	@key[14],$xd2,$xd2
	vpaddd	@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);

	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3

	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);

	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	vshufi32x4	\$0xdd,$xd3,$xb3,$xd3

	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
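# At this point $xa0,$xb0,$xc0,$xd0,$xa1,... each hold 64 contiguous
# bytes of keystream in output order, so they can be XORed against
# the input with plain 64-byte loads and stores below.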
	vpxord	0x00($inp),$xa0,$xa0	# xor with input
	vpxord	0x40($inp),$xb0,$xb0
	vpxord	0x80($inp),$xc0,$xc0
	vpxord	0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord	0x100($inp),$xa1,$xa1
	vpxord	0x140($inp),$xb1,$xb1
	vpxord	0x180($inp),$xc1,$xc1
	vpxord	0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord	0x200($inp),$xa2,$xa2
	vpxord	0x240($inp),$xb2,$xb2
	vpxord	0x280($inp),$xc2,$xc2
	vpxord	0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord	0x300($inp),$xa3,$xa3
	vpxord	0x340($inp),$xb3,$xb3
	vpxord	0x380($inp),$xc3,$xc3
	vpxord	0x3c0($inp),$xd3,$xd3
	lea	0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea	0x400($out),$out
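	# full iterations emit 16 blocks, i.e. 16*64 = 1024 bytes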
	jb	.Less_than_64_16x
	vpxord	($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
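	# note the ($out,$inp) addressing: $out is evidently biased by
	# -$inp at this point, so advancing $inp alone walks both
	# buffers through the 64-byte cascade below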
	jb	.Less_than_64_16x
	vpxord	($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
.Less_than_64_16x:
	vmovdqa32	$xa0,0x00(%rsp)
	lea	($out,$inp),$out

.Loop_tail16x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail16x
	vpxord	$xa0,$xa0,$xa0
	vmovdqa32	$xa0,0(%rsp)
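	# wipe the keystream copy from the stack before returning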
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
.cfi_def_cfa_register	%rsp
.size	ChaCha20_16x,.-ChaCha20_16x

# switch to %ymm domain
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];
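# Same lane-sliced machinery as ChaCha20_16x, re-targeted at %ymm
# registers: eight blocks per iteration instead of sixteen.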
.type	ChaCha20_8xvl,\@function,5
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
___
	lea	.Lsigma(%rip),%r10
	vbroadcasti128	(%r10),$xa3		# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xc3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vpshufd	\$0xaa,$xa3,$xa2
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vpshufd	\$0xaa,$xb3,$xb2
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd	\$0x00,$xc3,$xc0
	vpshufd	\$0x55,$xc3,$xc1
	vpshufd	\$0xaa,$xc3,$xc2
	vpshufd	\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpshufd	\$0xaa,$xd3,$xd2
	vpshufd	\$0xff,$xd3,$xd3
	vpaddd	.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]
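	# .Lincy supplies the eight consecutive block counters for the
	# %ymm flavour of the lane-sliced layout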
	#vpbroadcastd	0(%r10),$xa0	# reload key
	#vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd	.Leight(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }

	vpaddd	@key[0],$xa0,$xa0	# accumulate key
	vpaddd	@key[1],$xa1,$xa1
	vpaddd	@key[2],$xa2,$xa2
	vpaddd	@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"

	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);

	vpaddd	@key[4],$xb0,$xb0
	vpaddd	@key[5],$xb1,$xb1
	vpaddd	@key[6],$xb2,$xb2
	vpaddd	@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);

	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$3,$xb0,$xa0,$xb0
	vshufi32x4	\$0,$xb1,$xa1,$xa0
	vshufi32x4	\$3,$xb1,$xa1,$xb1
	vshufi32x4	\$0,$xb2,$xa2,$xa1
	vshufi32x4	\$3,$xb2,$xa2,$xb2
	vshufi32x4	\$0,$xb3,$xa3,$xa2
	vshufi32x4	\$3,$xb3,$xa3,$xb3
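	# vshufi32x4 with immediates 0 and 3 picks the low/high 128-bit
	# halves just like vperm2i128 0x20/0x31, but its EVEX encoding
	# can reach %ymm16+; the cheaper VEX vperm2i128 is used further
	# down once only %ymm0-15 are involved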
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);

	vpaddd	@key[8],$xc0,$xc0
	vpaddd	@key[9],$xc1,$xc1
	vpaddd	@key[10],$xc2,$xc2
	vpaddd	@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	@key[12],$xd0,$xd0
	vpaddd	@key[13],$xd1,$xd1
	vpaddd	@key[14],$xd2,$xd2
	vpaddd	@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);

	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3

	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);

	mov	\$0x80,%eax		# size optimization
	vpxord	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	lea	($out,%rax),$out	# size optimization
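	# stepping the pointers via %rax avoids the 4-byte displacement
	# that lea 0x80(reg),reg would need; likewise the EVEX forms
	# (vpxord, vmovdqu32) appear only where an operand lives in
	# %ymm16+ and the shorter VEX forms cannot encode it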
	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vpxor	0x40($inp),$xc1,$xc1
	vpxor	0x60($inp),$xd1,$xd1
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu	$xa1,0x00($out)
	vmovdqu	$xb1,0x20($out)
	vmovdqu	$xc1,0x40($out)
	vmovdqu	$xd1,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpxord	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vpxor	0x40($inp),$xc2,$xc2
	vpxor	0x60($inp),$xd2,$xd2
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa2,0x00($out)
	vmovdqu	$xb2,0x20($out)
	vmovdqu	$xc2,0x40($out)
	vmovdqu	$xd2,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vpxor	0x40($inp),$xc3,$xc3
	vpxor	0x60($inp),$xd3,$xd3
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu	$xa3,0x00($out)
	vmovdqu	$xb3,0x20($out)
	vmovdqu	$xc3,0x40($out)
	vmovdqu	$xd3,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpbroadcastd	0(%r10),%ymm0	# reload key
	vpbroadcastd	4(%r10),%ymm1

	vmovdqa64	$xa0,%ymm8	# size optimization

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vmovdqu	$xa0,0x00($out,$inp)
	vmovdqu	$xb0,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc0,$xc0
	vpxor	0x20($inp),$xd0,$xd0
	vmovdqu	$xc0,0x00($out,$inp)
	vmovdqu	$xd0,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vmovdqu	$xa1,0x00($out,$inp)
	vmovdqu	$xb1,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc1,$xc1
	vpxor	0x20($inp),$xd1,$xd1
	vmovdqu	$xc1,0x00($out,$inp)
	vmovdqu	$xd1,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxord	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vmovdqu32	$xa2,0x00($out,$inp)
	vmovdqu	$xb2,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc2,$xc2
	vpxor	0x20($inp),$xd2,$xd2
	vmovdqu	$xc2,0x00($out,$inp)
	vmovdqu	$xd2,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vmovdqu	$xa3,0x00($out,$inp)
	vmovdqu	$xb3,0x20($out,$inp)
.Less_than_64_8xvl:
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x20(%rsp)
	lea	($out,$inp),$out

.Loop_tail8xvl:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail8xvl
	vpxor	$xa0,$xa0,$xa0
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xa0,0x20(%rsp)

$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
.cfi_def_cfa_register	%rsp
.size	ChaCha20_8xvl,.-ChaCha20_8xvl

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

.type	simd_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-8(%rax,%rcx),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
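	# the rep movsq presumably copies the %xmm6-%xmm15 save area
	# (whose size arrives in HandlerData[2]) into the CONTEXT record
	# at context.Xmm6, so the unwinder observes the nonvolatile SIMD
	# registers as already restored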
	jmp	.Lcommon_seh_tail
.size	simd_handler,.-simd_handler

	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
$code.=<<___	if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
$code.=<<___	if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
$code.=<<___	if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_avx512vl
	.rva	.LSEH_end_ChaCha20_avx512vl
	.rva	.LSEH_info_ChaCha20_avx512vl

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x

	.rva	.LSEH_begin_ChaCha20_8xvl
	.rva	.LSEH_end_ChaCha20_8xvl
	.rva	.LSEH_info_ChaCha20_8xvl

.LSEH_info_ChaCha20_ctr32:
.LSEH_info_ChaCha20_ssse3:
	.rva	.Lssse3_body,.Lssse3_epilogue
.LSEH_info_ChaCha20_128:
	.rva	.L128_body,.L128_epilogue
.LSEH_info_ChaCha20_4x:
	.rva	.L4x_body,.L4x_epilogue
$code.=<<___	if ($avx);
.LSEH_info_ChaCha20_4xop:
	.rva	.L4xop_body,.L4xop_epilogue	# HandlerData[]
$code.=<<___	if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.rva	.L8x_body,.L8x_epilogue		# HandlerData[]
$code.=<<___	if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.rva	.Lavx512_body,.Lavx512_epilogue	# HandlerData[]
.LSEH_info_ChaCha20_avx512vl:
	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
.LSEH_info_ChaCha20_16x:
	.rva	.L16x_body,.L16x_epilogue	# HandlerData[]
.LSEH_info_ChaCha20_8xvl:
	.rva	.L8xvl_body,.L8xvl_epilogue	# HandlerData[]
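# Final fixup pass over the generated text: backquoted Perl
# expressions in $code are evaluated in place, and the "%x#" prefix
# notation used above (e.g. "%x#$a", which expands to "%x#%zmm0") is
# collapsed to the corresponding %xmm name.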
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/%x#%[yz]/%x/g;	# "down-shift"

	print $_,"\n";
}

close STDOUT;