2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
4 # Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 # Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
6 # Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
8 # This code is taken from the OpenSSL project but the author, Andy Polyakov,
9 # has relicensed it under the licenses specified in the SPDX header above.
10 # The original headers, including the original license headers, are
11 # included below for completeness.
13 # ====================================================================
14 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
15 # project. The module is, however, dual licensed under OpenSSL and
16 # CRYPTOGAMS licenses depending on where you obtain it. For further
17 # details see http://www.openssl.org/~appro/cryptogams/.
18 # ====================================================================
20 # This module implements Poly1305 hash for x86_64.
28 # Add AVX512F+VL+BW code path.
32 # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
33 # executed even on Knights Landing. The trigger for the modification was
34 # the observation that AVX512 code paths can negatively affect overall
35 # Skylake-X system performance. Since we are likely to suppress the
36 # AVX512F capability flag [at least on Skylake-X], the conversion serves
37 # as a kind of "investment protection". Note that the next *lake processor,
38 # Cannon Lake, has an AVX512IFMA code path to execute...
40 # Numbers are cycles per processed byte with poly1305_blocks alone,
41 # measured with rdtsc at fixed clock frequency.
43 #                 IALU/gcc-4.8(*)  AVX(**)  AVX2   AVX-512
46 # Westmere        1.88/+120%       -
47 # Sandy Bridge    1.39/+140%       1.10
48 # Haswell         1.14/+175%       1.11     0.65
49 # Skylake[-X]     1.13/+120%       0.96     0.51   [0.35]
50 # Silvermont      2.83/+95%        -
51 # Knights L       3.60/?           1.65     1.10   0.41(***)
52 # Goldmont        1.70/+180%       -
53 # VIA Nano        1.82/+150%       -
54 # Sledgehammer    1.38/+160%       -
55 # Bulldozer       2.30/+130%       0.97
56 # Ryzen           1.15/+200%       1.08     1.18
58 # (*) improvement coefficients relative to clang are more modest and
59 # are ~50% on most processors; in both cases we are comparing to
60 # __int128 code;
61 # (**) an SSE2 implementation was attempted, but among non-AVX processors
62 # it was faster than integer-only code only on older Intel P4 and
63 # Core processors, by 30-50% (less so the newer the processor is),
64 # while being slower on contemporary ones, e.g. almost 2x slower on
65 # Atom; as the former are naturally disappearing, SSE2 is deemed unnecessary;
66 # (***) strangely enough, performance seems to vary from core to core;
67 # the listed result is the best case;
71 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
73 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
74 $kernel=0; $kernel=1 if (!$flavour && !$output);
77 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80 die "can't locate x86_64-xlate.pl";
82 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
85 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
86 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
87 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
90 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
91 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
92 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
93 $avx += 1 if ($1==2.11 && $2>=8);
96 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
97 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
98 $avx = ($1>=10) + ($1>=11);
101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
102 $avx = ($2>=3.0) + ($2>3.0);
105 $avx = 4; # The kernel uses ifdefs for this.
108 sub declare_function() {
109 	my ($name, $align, $nargs) = @_;
110 	if($kernel) {
111 		$code .= ".align $align\n";
112 		$code .= "SYM_FUNC_START($name)\n";
113 		$code .= ".L$name:\n";
114 	} else {
115 		$code .= ".globl $name\n";
116 		$code .= ".type $name,\@function,$nargs\n";
117 		$code .= ".align $align\n";
118 		$code .= "$name:\n";
119 	}
120 }
122 sub end_function() {
123 	my ($name) = @_;
124 	if($kernel) {
125 		$code .= "SYM_FUNC_END($name)\n";
126 	} else {
127 		$code .= ".size $name,.-$name\n";
128 	}
129 }
131 $code.=<<___ if $kernel;
132 #include <linux/linkage.h>
136 $code.=<<___ if $kernel;
143 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
145 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
147 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
149 .long 2,2,2,3,2,0,2,1
151 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
154 .long 0,1,1,2,2,3,7,7
158 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
166 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
167 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
169 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
170 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
173 $code.=<<___ if (!$kernel);
174 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
178 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
179 my ($mac,$nonce)=($inp,$len); # *_emit arguments
180 my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
181 my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
183 sub poly1305_iteration {
184 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
185 # output: $h0-$h2 *= $r0-$r1
193 mov %rax,$h0 # future $h0
203 mov $h2,$h1 # borrow $h1
207 imulq $s1,$h1 # h2*s1
212 imulq $r0,$h2 # h2*r0
214 mov \$-4,%rax # mask value
217 and $d3,%rax # last reduction step
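#
# For reference, the iteration above reads as follows in scalar C (a
# sketch, assuming a partially reduced hash with h[2] < 8 and a clamped
# key; names are illustrative, not this module's):
#
#	#include <stdint.h>
#	typedef unsigned __int128 u128;
#
#	/* h = h * r mod 2^130 - 5, h kept as h[0] + h[1]*2^64 + h[2]*2^128 */
#	static void poly1305_iteration_c(uint64_t h[3], uint64_t r0, uint64_t r1)
#	{
#		/* s1 = r1 + (r1 >> 2) = 5*r1/4, exact because clamping clears
#		 * the low 2 bits of r1; it folds the h[1]*r1*2^128 term back
#		 * via 2^130 == 5 */
#		uint64_t s1 = r1 + (r1 >> 2);
#		u128 d0 = (u128)h[0] * r0 + (u128)h[1] * s1;
#		u128 d1 = (u128)h[0] * r1 + (u128)h[1] * r0 + h[2] * s1;
#		uint64_t d2, c;
#
#		d1 += (uint64_t)(d0 >> 64);
#		d2 = h[2] * r0 + (uint64_t)(d1 >> 64);	/* h[2] is tiny */
#		h[0] = (uint64_t)d0;
#		h[1] = (uint64_t)d1;
#		h[2] = d2 & 3;			/* keep 2 bits above 2^128 */
#		c = d2 & ~(uint64_t)3;		/* the -4 mask above:       */
#		d0 = (u128)h[0] + c + (c >> 2);	/* fold 4*q back in as 5*q  */
#		h[0] = (uint64_t)d0;
#		d1 = (u128)h[1] + (uint64_t)(d0 >> 64);
#		h[1] = (uint64_t)d1;
#		h[2] += (uint64_t)(d1 >> 64);
#	}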
228 ########################################################################
229 # The layout of the opaque area is as follows.
231 # unsigned __int64 h[3]; # current hash value base 2^64
232 # unsigned __int64 r[2]; # key value base 2^64
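#
# In C terms (a sketch; the real code works with the byte offsets
# directly, the struct and field names here are illustrative):
#
#	struct poly1305_ctx_base2_64 {
#		uint64_t h[3];	/* current hash value, base 2^64 */
#		uint64_t r[2];	/* clamped key, at byte offset 24 */
#	};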
237 $code.=<<___ if (!$kernel);
238 .extern OPENSSL_ia32cap_P
240 .globl poly1305_init_x86_64
241 .hidden poly1305_init_x86_64
242 .globl poly1305_blocks_x86_64
243 .hidden poly1305_blocks_x86_64
244 .globl poly1305_emit_x86_64
245 .hidden poly1305_emit_x86_64
247 &declare_function("poly1305_init_x86_64", 32, 3);
250 mov %rax,0($ctx) # initialize hash value
257 $code.=<<___ if (!$kernel);
258 lea poly1305_blocks_x86_64(%rip),%r10
259 lea poly1305_emit_x86_64(%rip),%r11
261 $code.=<<___ if (!$kernel && $avx);
262 mov OPENSSL_ia32cap_P+4(%rip),%r9
263 lea poly1305_blocks_avx(%rip),%rax
264 lea poly1305_emit_avx(%rip),%rcx
265 bt \$`60-32`,%r9 # AVX?
269 $code.=<<___ if (!$kernel && $avx>1);
270 lea poly1305_blocks_avx2(%rip),%rax
271 bt \$`5+32`,%r9 # AVX2?
274 $code.=<<___ if (!$kernel && $avx>3);
275 mov \$`(1<<31|1<<21|1<<16)`,%rax
282 mov \$0x0ffffffc0fffffff,%rax
283 mov \$0x0ffffffc0ffffffc,%rcx
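#
# These two masks implement the standard Poly1305 clamping of r: the top
# 4 bits of each 32-bit word are cleared, and so are the low 2 bits of
# the upper three words. A sketch in C (le64() stands for an unaligned
# little-endian load, not a function of this module):
#
#	r0 = le64(key + 0) & 0x0ffffffc0fffffffULL;
#	r1 = le64(key + 8) & 0x0ffffffc0ffffffcULL;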
289 $code.=<<___ if (!$kernel && $flavour !~ /elf32/);
293 $code.=<<___ if (!$kernel && $flavour =~ /elf32/);
302 &end_function("poly1305_init_x86_64");
304 &declare_function("poly1305_blocks_x86_64", 32, 4);
309 jz .Lno_data # too short
325 mov $len,%r15 # reassign $len
327 mov 24($ctx),$r0 # load r
330 mov 0($ctx),$h0 # load hash value
337 add $r1,$s1 # s1 = r1 + (r1 >> 2)
342 add 0($inp),$h0 # accumulate input
348 &poly1305_iteration();
358 mov $h0,0($ctx) # store hash value
373 .cfi_adjust_cfa_offset -48
379 &end_function("poly1305_blocks_x86_64");
381 &declare_function("poly1305_emit_x86_64", 32, 3);
384 mov 0($ctx),%r8 # load hash value
389 add \$5,%r8 # compare to modulus
393 shr \$2,%r10 # did 130-bit value overflow?
397 add 0($nonce),%rax # accumulate nonce
399 mov %rax,0($mac) # write result
404 &end_function("poly1305_emit_x86_64");
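#
# The final reduction performed by poly1305_emit, sketched in scalar C
# (assuming a fully carried h with h[2] <= 4; the branch-free selection
# via masking mirrors the cmov-style code above):
#
#	#include <stdint.h>
#	#include <string.h>
#	typedef unsigned __int128 u128;
#
#	static void poly1305_emit_c(const uint64_t h[3], uint8_t mac[16],
#				    const uint64_t nonce[2])
#	{
#		u128 t = (u128)h[0] + 5;	/* compare to modulus */
#		uint64_t g0 = (uint64_t)t;
#		uint64_t g1, g2, mask, t0, t1;
#
#		t  = (u128)h[1] + (uint64_t)(t >> 64);
#		g1 = (uint64_t)t;
#		g2 = h[2] + (uint64_t)(t >> 64);
#
#		mask = 0 - (g2 >> 2);		/* did bit 130 get set? */
#		g0 = (g0 & mask) | (h[0] & ~mask);
#		g1 = (g1 & mask) | (h[1] & ~mask);
#
#		t  = (u128)g0 + nonce[0];	/* accumulate nonce */
#		t0 = (uint64_t)t;
#		t1 = g1 + nonce[1] + (uint64_t)(t >> 64);
#		memcpy(mac + 0, &t0, 8);	/* x86 is little-endian */
#		memcpy(mac + 8, &t1, 8);
#	}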
408 $code .= "#ifdef CONFIG_AS_AVX\n";
411 ########################################################################
412 # The layout of the opaque area is as follows.
414 # unsigned __int32 h[5]; # current hash value base 2^26
415 # unsigned __int32 is_base2_26;
416 # unsigned __int64 r[2]; # key value base 2^64
417 # unsigned __int64 pad;
418 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
420 # where r^n are the base 2^26 digits of the powers of the multiplier
421 # key. There are 5 digits, but the last four are interleaved with their
422 # multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
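#
# A sketch of the digit split in C, for a value fitting 128 bits such as
# the clamped key (the powers r^2, r^3, ... carry two extra top bits,
# which the real code folds into the top digit; names are illustrative):
#
#	#include <stdint.h>
#
#	static void to_base26(uint32_t d[9], uint64_t lo, uint64_t hi)
#	{
#		d[0] = (uint32_t)( lo         & 0x3ffffff);
#		d[1] = (uint32_t)((lo >> 26)  & 0x3ffffff);
#		d[3] = (uint32_t)(((lo >> 52) | (hi << 12)) & 0x3ffffff);
#		d[5] = (uint32_t)((hi >> 14)  & 0x3ffffff);
#		d[7] = (uint32_t)( hi >> 40);
#		d[2] = d[1] * 5;	/* the interleaved multiples of 5  */
#		d[4] = d[3] * 5;	/* save a multiply by 5 in the     */
#		d[6] = d[5] * 5;	/* d0..d3 terms of every           */
#		d[8] = d[7] * 5;	/* vectorized iteration            */
#	}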
424 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
425 map("%xmm$_",(0..15));
428 .type __poly1305_block,\@abi-omnipotent
433 &poly1305_iteration();
437 .size __poly1305_block,.-__poly1305_block
439 .type __poly1305_init_avx,\@abi-omnipotent
448 lea 48+64($ctx),$ctx # size optimization
451 call __poly1305_block # r^2
453 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
459 mov %eax,`16*0+0-64`($ctx)
461 mov %edx,`16*0+4-64`($ctx)
468 mov %eax,`16*1+0-64`($ctx)
469 lea (%rax,%rax,4),%eax # *5
470 mov %edx,`16*1+4-64`($ctx)
471 lea (%rdx,%rdx,4),%edx # *5
472 mov %eax,`16*2+0-64`($ctx)
474 mov %edx,`16*2+4-64`($ctx)
485 mov %eax,`16*3+0-64`($ctx)
486 lea (%rax,%rax,4),%eax # *5
487 mov %edx,`16*3+4-64`($ctx)
488 lea (%rdx,%rdx,4),%edx # *5
489 mov %eax,`16*4+0-64`($ctx)
491 mov %edx,`16*4+4-64`($ctx)
500 mov %eax,`16*5+0-64`($ctx)
501 lea (%rax,%rax,4),%eax # *5
502 mov %edx,`16*5+4-64`($ctx)
503 lea (%rdx,%rdx,4),%edx # *5
504 mov %eax,`16*6+0-64`($ctx)
506 mov %edx,`16*6+4-64`($ctx)
512 mov $d1#d,`16*7+0-64`($ctx)
513 lea ($d1,$d1,4),$d1 # *5
514 mov $d2#d,`16*7+4-64`($ctx)
515 lea ($d2,$d2,4),$d2 # *5
516 mov $d1#d,`16*8+0-64`($ctx)
517 mov $d2#d,`16*8+4-64`($ctx)
520 call __poly1305_block # r^3
522 mov \$0x3ffffff,%eax # save r^3 base 2^26
526 mov %eax,`16*0+12-64`($ctx)
530 mov %edx,`16*1+12-64`($ctx)
531 lea (%rdx,%rdx,4),%edx # *5
533 mov %edx,`16*2+12-64`($ctx)
539 mov %eax,`16*3+12-64`($ctx)
540 lea (%rax,%rax,4),%eax # *5
542 mov %eax,`16*4+12-64`($ctx)
547 mov %edx,`16*5+12-64`($ctx)
548 lea (%rdx,%rdx,4),%edx # *5
550 mov %edx,`16*6+12-64`($ctx)
555 mov $d1#d,`16*7+12-64`($ctx)
556 lea ($d1,$d1,4),$d1 # *5
557 mov $d1#d,`16*8+12-64`($ctx)
560 call __poly1305_block # r^4
562 mov \$0x3ffffff,%eax # save r^4 base 2^26
566 mov %eax,`16*0+8-64`($ctx)
570 mov %edx,`16*1+8-64`($ctx)
571 lea (%rdx,%rdx,4),%edx # *5
573 mov %edx,`16*2+8-64`($ctx)
579 mov %eax,`16*3+8-64`($ctx)
580 lea (%rax,%rax,4),%eax # *5
582 mov %eax,`16*4+8-64`($ctx)
587 mov %edx,`16*5+8-64`($ctx)
588 lea (%rdx,%rdx,4),%edx # *5
590 mov %edx,`16*6+8-64`($ctx)
595 mov $d1#d,`16*7+8-64`($ctx)
596 lea ($d1,$d1,4),$d1 # *5
597 mov $d1#d,`16*8+8-64`($ctx)
599 lea -48-64($ctx),$ctx # size [de-]optimization
602 .size __poly1305_init_avx,.-__poly1305_init_avx
605 &declare_function("poly1305_blocks_avx", 32, 4);
608 mov 20($ctx),%r8d # is_base2_26
641 mov $len,%r15 # reassign $len
643 mov 0($ctx),$d1 # load hash value
647 mov 24($ctx),$r0 # load r
650 ################################# base 2^26 -> base 2^64
652 and \$`-1*(1<<31)`,$d1
653 mov $d2,$r1 # borrow $r1
655 and \$`-1*(1<<31)`,$d2
669 adc \$0,$h2 # can be partially reduced...
671 mov \$-4,$d2 # ... so reduce
684 add $r1,$s1 # s1 = r1 + (r1 >> 2)
686 add 0($inp),$h0 # accumulate input
691 call __poly1305_block
693 test $padbit,$padbit # if $padbit is zero,
694 jz .Lstore_base2_64_avx # store hash in base 2^64 format
696 ################################# base 2^64 -> base 2^26
703 and \$0x3ffffff,%rax # h[0]
705 and \$0x3ffffff,%rdx # h[1]
709 and \$0x3ffffff,$h0 # h[2]
711 and \$0x3ffffff,$h1 # h[3]
715 jz .Lstore_base2_26_avx
725 .Lstore_base2_64_avx:
728 mov $h2,16($ctx) # note that is_base2_26 is zeroed
732 .Lstore_base2_26_avx:
733 mov %rax#d,0($ctx) # store hash value base 2^26
753 .Lblocks_avx_epilogue:
775 mov $len,%r15 # reassign $len
777 mov 24($ctx),$r0 # load r
780 mov 0($ctx),$h0 # load hash value
787 add $r1,$s1 # s1 = r1 + (r1 >> 2)
792 add 0($inp),$h0 # accumulate input
798 call __poly1305_block
801 ################################# base 2^64 -> base 2^26
808 and \$0x3ffffff,%rax # h[0]
810 and \$0x3ffffff,%rdx # h[1]
814 and \$0x3ffffff,$h0 # h[2]
816 and \$0x3ffffff,$h1 # h[3]
824 movl \$1,20($ctx) # set is_base2_26
826 call __poly1305_init_avx
842 .Lbase2_64_avx_epilogue:
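#
# The base 2^26 -> base 2^64 conversion used around here, sketched in C
# (the stored limbs may each carry a few extra bits when only partially
# reduced, hence add rather than OR, and the final folding step):
#
#	typedef unsigned __int128 u128;
#
#	static void base26_to_base64(uint64_t h[3], const uint32_t d[5])
#	{
#		u128 t;
#		uint64_t c;
#
#		t = (u128)d[0] + ((u128)d[1] << 26) + ((u128)d[2] << 52);
#		h[0] = (uint64_t)t;
#		t = (t >> 64) + ((u128)d[3] << 14) + ((u128)d[4] << 40);
#		h[1] = (uint64_t)t;
#		h[2] = (uint64_t)(t >> 64);	/* can be partially reduced... */
#
#		c = h[2] & ~(uint64_t)3;	/* ... so reduce: fold bits  */
#		h[2] &= 3;			/* >= 2^130 via 2^130 == 5   */
#		t = (u128)h[0] + c + (c >> 2);
#		h[0] = (uint64_t)t;
#		t = (u128)h[1] + (uint64_t)(t >> 64);
#		h[1] = (uint64_t)t;
#		h[2] += (uint64_t)(t >> 64);
#	}
#
# The opposite direction is the plain shift-and-mask split into five
# 26-bit digits shown in the table-setup sketch earlier.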
849 vmovd 4*0($ctx),$H0 # load hash value
857 $code.=<<___ if (!$win64);
859 .cfi_def_cfa_register %r10
866 $code.=<<___ if ($win64);
869 vmovdqa %xmm6,0x50(%r11)
870 vmovdqa %xmm7,0x60(%r11)
871 vmovdqa %xmm8,0x70(%r11)
872 vmovdqa %xmm9,0x80(%r11)
873 vmovdqa %xmm10,0x90(%r11)
874 vmovdqa %xmm11,0xa0(%r11)
875 vmovdqa %xmm12,0xb0(%r11)
876 vmovdqa %xmm13,0xc0(%r11)
877 vmovdqa %xmm14,0xd0(%r11)
878 vmovdqa %xmm15,0xe0(%r11)
886 vmovdqu `16*3`($ctx),$D4 # preload r0^2
887 lea `16*3+64`($ctx),$ctx # size optimization
888 lea .Lconst(%rip),%rcx
890 ################################################################
892 vmovdqu 16*2($inp),$T0
893 vmovdqu 16*3($inp),$T1
894 vmovdqa 64(%rcx),$MASK # .Lmask26
896 vpsrldq \$6,$T0,$T2 # splat input
898 vpunpckhqdq $T1,$T0,$T4 # 4
899 vpunpcklqdq $T1,$T0,$T0 # 0:1
900 vpunpcklqdq $T3,$T2,$T3 # 2:3
902 vpsrlq \$40,$T4,$T4 # 4
904 vpand $MASK,$T0,$T0 # 0
906 vpand $MASK,$T1,$T1 # 1
908 vpand $MASK,$T2,$T2 # 2
909 vpand $MASK,$T3,$T3 # 3
910 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
914 # expand and copy pre-calculated table to stack
915 vmovdqu `16*1-64`($ctx),$D1
916 vmovdqu `16*2-64`($ctx),$D2
917 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
918 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
919 vmovdqa $D3,-0x90(%r11)
920 vmovdqa $D0,0x00(%rsp)
921 vpshufd \$0xEE,$D1,$D4
922 vmovdqu `16*3-64`($ctx),$D0
923 vpshufd \$0x44,$D1,$D1
924 vmovdqa $D4,-0x80(%r11)
925 vmovdqa $D1,0x10(%rsp)
926 vpshufd \$0xEE,$D2,$D3
927 vmovdqu `16*4-64`($ctx),$D1
928 vpshufd \$0x44,$D2,$D2
929 vmovdqa $D3,-0x70(%r11)
930 vmovdqa $D2,0x20(%rsp)
931 vpshufd \$0xEE,$D0,$D4
932 vmovdqu `16*5-64`($ctx),$D2
933 vpshufd \$0x44,$D0,$D0
934 vmovdqa $D4,-0x60(%r11)
935 vmovdqa $D0,0x30(%rsp)
936 vpshufd \$0xEE,$D1,$D3
937 vmovdqu `16*6-64`($ctx),$D0
938 vpshufd \$0x44,$D1,$D1
939 vmovdqa $D3,-0x50(%r11)
940 vmovdqa $D1,0x40(%rsp)
941 vpshufd \$0xEE,$D2,$D4
942 vmovdqu `16*7-64`($ctx),$D1
943 vpshufd \$0x44,$D2,$D2
944 vmovdqa $D4,-0x40(%r11)
945 vmovdqa $D2,0x50(%rsp)
946 vpshufd \$0xEE,$D0,$D3
947 vmovdqu `16*8-64`($ctx),$D2
948 vpshufd \$0x44,$D0,$D0
949 vmovdqa $D3,-0x30(%r11)
950 vmovdqa $D0,0x60(%rsp)
951 vpshufd \$0xEE,$D1,$D4
952 vpshufd \$0x44,$D1,$D1
953 vmovdqa $D4,-0x20(%r11)
954 vmovdqa $D1,0x70(%rsp)
955 vpshufd \$0xEE,$D2,$D3
956 vmovdqa 0x00(%rsp),$D4 # preload r0^2
957 vpshufd \$0x44,$D2,$D2
958 vmovdqa $D3,-0x10(%r11)
959 vmovdqa $D2,0x80(%rsp)
965 ################################################################
966 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
967 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
968 # \___________________/
969 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
970 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
971 # \___________________/ \____________________/
973 # Note that we start with inp[2:3]*r^2. This is because it
974 # doesn't depend on the reduction in the previous iteration.
975 ################################################################
976 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
977 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
978 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
979 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
980 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
982 # though note that $Tx and $Hx are "reversed" in this section,
983 # and $D4 is preloaded with r0^2...
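#
# The schedule is easier to see in scalar form. A sketch with assumed
# helpers (fe1305 and fe_add/fe_mul are hypothetical mod 2^130-5
# operations, not part of this module); n is the even number of blocks:
#
#	#include <stdint.h>
#	#include <stddef.h>
#	typedef struct { uint64_t v[3]; } fe1305;
#	extern fe1305 fe_add(fe1305 a, fe1305 b);	/* assumed helpers */
#	extern fe1305 fe_mul(fe1305 a, fe1305 b);	/* mod 2^130 - 5   */
#
#	fe1305 poly1305_2way(const fe1305 *m, size_t n,	/* n even, n >= 2 */
#			     fe1305 r1, fe1305 r2)	/* r and r^2      */
#	{
#		fe1305 he = {{0}}, ho = {{0}};		/* even/odd streams */
#		size_t i;
#		for (i = 0; i + 2 < n; i += 2) {
#			he = fe_mul(fe_add(he, m[i]),     r2);
#			ho = fe_mul(fe_add(ho, m[i + 1]), r2);
#		}
#		he = fe_mul(fe_add(he, m[i]),     r2);	/* final powers:   */
#		ho = fe_mul(fe_add(ho, m[i + 1]), r1);	/* r^2 even, r odd */
#		return fe_add(he, ho);			/* horizontal add  */
#	}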
985 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
986 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
987 vmovdqa $H2,0x20(%r11) # offload hash
988 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
989 vmovdqa 0x10(%rsp),$H2 # r1^2
990 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
991 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
993 vmovdqa $H0,0x00(%r11) #
994 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
995 vmovdqa $H1,0x10(%r11) #
996 vpmuludq $T3,$H2,$H1 # h3*r1
997 vpaddq $H0,$D0,$D0 # d0 += h4*s1
998 vpaddq $H1,$D4,$D4 # d4 += h3*r1
999 vmovdqa $H3,0x30(%r11) #
1000 vpmuludq $T2,$H2,$H0 # h2*r1
1001 vpmuludq $T1,$H2,$H1 # h1*r1
1002 vpaddq $H0,$D3,$D3 # d3 += h2*r1
1003 vmovdqa 0x30(%rsp),$H3 # r2^2
1004 vpaddq $H1,$D2,$D2 # d2 += h1*r1
1005 vmovdqa $H4,0x40(%r11) #
1006 vpmuludq $T0,$H2,$H2 # h0*r1
1007 vpmuludq $T2,$H3,$H0 # h2*r2
1008 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1010 vmovdqa 0x40(%rsp),$H4 # s2^2
1011 vpaddq $H0,$D4,$D4 # d4 += h2*r2
1012 vpmuludq $T1,$H3,$H1 # h1*r2
1013 vpmuludq $T0,$H3,$H3 # h0*r2
1014 vpaddq $H1,$D3,$D3 # d3 += h1*r2
1015 vmovdqa 0x50(%rsp),$H2 # r3^2
1016 vpaddq $H3,$D2,$D2 # d2 += h0*r2
1017 vpmuludq $T4,$H4,$H0 # h4*s2
1018 vpmuludq $T3,$H4,$H4 # h3*s2
1019 vpaddq $H0,$D1,$D1 # d1 += h4*s2
1020 vmovdqa 0x60(%rsp),$H3 # s3^2
1021 vpaddq $H4,$D0,$D0 # d0 += h3*s2
1023 vmovdqa 0x80(%rsp),$H4 # s4^2
1024 vpmuludq $T1,$H2,$H1 # h1*r3
1025 vpmuludq $T0,$H2,$H2 # h0*r3
1026 vpaddq $H1,$D4,$D4 # d4 += h1*r3
1027 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1028 vpmuludq $T4,$H3,$H0 # h4*s3
1029 vpmuludq $T3,$H3,$H1 # h3*s3
1030 vpaddq $H0,$D2,$D2 # d2 += h4*s3
1031 vmovdqu 16*0($inp),$H0 # load input
1032 vpaddq $H1,$D1,$D1 # d1 += h3*s3
1033 vpmuludq $T2,$H3,$H3 # h2*s3
1034 vpmuludq $T2,$H4,$T2 # h2*s4
1035 vpaddq $H3,$D0,$D0 # d0 += h2*s3
1037 vmovdqu 16*1($inp),$H1 #
1038 vpaddq $T2,$D1,$D1 # d1 += h2*s4
1039 vpmuludq $T3,$H4,$T3 # h3*s4
1040 vpmuludq $T4,$H4,$T4 # h4*s4
1041 vpsrldq \$6,$H0,$H2 # splat input
1042 vpaddq $T3,$D2,$D2 # d2 += h3*s4
1043 vpaddq $T4,$D3,$D3 # d3 += h4*s4
1044 vpsrldq \$6,$H1,$H3 #
1045 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
1046 vpmuludq $T1,$H4,$T0 # h1*s4
1047 vpunpckhqdq $H1,$H0,$H4 # 4
1048 vpaddq $T4,$D4,$D4 # d4 += h0*r4
1049 vmovdqa -0x90(%r11),$T4 # r0^4
1050 vpaddq $T0,$D0,$D0 # d0 += h1*s4
1052 vpunpcklqdq $H1,$H0,$H0 # 0:1
1053 vpunpcklqdq $H3,$H2,$H3 # 2:3
1055 #vpsrlq \$40,$H4,$H4 # 4
1056 vpsrldq \$`40/8`,$H4,$H4 # 4
1058 vpand $MASK,$H0,$H0 # 0
1060 vpand $MASK,$H1,$H1 # 1
1061 vpand 0(%rcx),$H4,$H4 # .Lmask24
1063 vpand $MASK,$H2,$H2 # 2
1064 vpand $MASK,$H3,$H3 # 3
1065 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1067 vpaddq 0x00(%r11),$H0,$H0 # add hash value
1068 vpaddq 0x10(%r11),$H1,$H1
1069 vpaddq 0x20(%r11),$H2,$H2
1070 vpaddq 0x30(%r11),$H3,$H3
1071 vpaddq 0x40(%r11),$H4,$H4
1078 ################################################################
1079 # Now we accumulate (inp[0:1]+hash)*r^4
1080 ################################################################
1081 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1082 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1083 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1084 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1085 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1087 vpmuludq $H0,$T4,$T0 # h0*r0
1088 vpmuludq $H1,$T4,$T1 # h1*r0
1091 vmovdqa -0x80(%r11),$T2 # r1^4
1092 vpmuludq $H2,$T4,$T0 # h2*r0
1093 vpmuludq $H3,$T4,$T1 # h3*r0
1096 vpmuludq $H4,$T4,$T4 # h4*r0
1097 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1100 vpaddq $T0,$D0,$D0 # d0 += h4*s1
1101 vpmuludq $H2,$T2,$T1 # h2*r1
1102 vpmuludq $H3,$T2,$T0 # h3*r1
1103 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1104 vmovdqa -0x60(%r11),$T3 # r2^4
1105 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1106 vpmuludq $H1,$T2,$T1 # h1*r1
1107 vpmuludq $H0,$T2,$T2 # h0*r1
1108 vpaddq $T1,$D2,$D2 # d2 += h1*r1
1109 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1111 vmovdqa -0x50(%r11),$T4 # s2^4
1112 vpmuludq $H2,$T3,$T0 # h2*r2
1113 vpmuludq $H1,$T3,$T1 # h1*r2
1114 vpaddq $T0,$D4,$D4 # d4 += h2*r2
1115 vpaddq $T1,$D3,$D3 # d3 += h1*r2
1116 vmovdqa -0x40(%r11),$T2 # r3^4
1117 vpmuludq $H0,$T3,$T3 # h0*r2
1118 vpmuludq $H4,$T4,$T0 # h4*s2
1119 vpaddq $T3,$D2,$D2 # d2 += h0*r2
1120 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1121 vmovdqa -0x30(%r11),$T3 # s3^4
1122 vpmuludq $H3,$T4,$T4 # h3*s2
1123 vpmuludq $H1,$T2,$T1 # h1*r3
1124 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1126 vmovdqa -0x10(%r11),$T4 # s4^4
1127 vpaddq $T1,$D4,$D4 # d4 += h1*r3
1128 vpmuludq $H0,$T2,$T2 # h0*r3
1129 vpmuludq $H4,$T3,$T0 # h4*s3
1130 vpaddq $T2,$D3,$D3 # d3 += h0*r3
1131 vpaddq $T0,$D2,$D2 # d2 += h4*s3
1132 vmovdqu 16*2($inp),$T0 # load input
1133 vpmuludq $H3,$T3,$T2 # h3*s3
1134 vpmuludq $H2,$T3,$T3 # h2*s3
1135 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1136 vmovdqu 16*3($inp),$T1 #
1137 vpaddq $T3,$D0,$D0 # d0 += h2*s3
1139 vpmuludq $H2,$T4,$H2 # h2*s4
1140 vpmuludq $H3,$T4,$H3 # h3*s4
1141 vpsrldq \$6,$T0,$T2 # splat input
1142 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1143 vpmuludq $H4,$T4,$H4 # h4*s4
1144 vpsrldq \$6,$T1,$T3 #
1145 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1146 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1147 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1148 vpmuludq $H1,$T4,$H0
1149 vpunpckhqdq $T1,$T0,$T4 # 4
1150 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1151 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1153 vpunpcklqdq $T1,$T0,$T0 # 0:1
1154 vpunpcklqdq $T3,$T2,$T3 # 2:3
1156 #vpsrlq \$40,$T4,$T4 # 4
1157 vpsrldq \$`40/8`,$T4,$T4 # 4
1159 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1160 vpand $MASK,$T0,$T0 # 0
1162 vpand $MASK,$T1,$T1 # 1
1163 vpand 0(%rcx),$T4,$T4 # .Lmask24
1165 vpand $MASK,$T2,$T2 # 2
1166 vpand $MASK,$T3,$T3 # 3
1167 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1169 ################################################################
1170 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1175 vpaddq $D3,$H4,$H4 # h3 -> h4
1179 vpaddq $D0,$D1,$H1 # h0 -> h1
1186 vpaddq $D1,$H2,$H2 # h1 -> h2
1190 vpaddq $D0,$H0,$H0 # h4 -> h0
1194 vpaddq $D2,$H3,$H3 # h2 -> h3
1198 vpaddq $D0,$H1,$H1 # h0 -> h1
1202 vpaddq $D3,$H4,$H4 # h3 -> h4
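#
# One lane of the lazy reduction in C, for reference (inputs are the
# five 64-bit per-lane sums d0..d4; afterwards every limb is at most a
# few bits over 26, which the next multiplication tolerates):
#
#	#include <stdint.h>
#
#	static void lazy_reduce(uint64_t h[5])
#	{
#		uint64_t c;
#		c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;	  /* h3 -> h4 */
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;	  /* h0 -> h1 */
#		c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;	  /* h1 -> h2 */
#		c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* h4 -> h0,
#							wrap via 2^130 == 5 */
#		c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;	  /* h2 -> h3 */
#		c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;	  /* h0 -> h1 */
#		c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;	  /* h3 -> h4 */
#	}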
1207 ################################################################
1208 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1210 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1221 vmovdqa $H2,0x20(%r11)
1222 vmovdqa $H0,0x00(%r11)
1223 vmovdqa $H1,0x10(%r11)
1224 vmovdqa $H3,0x30(%r11)
1225 vmovdqa $H4,0x40(%r11)
1227 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1228 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1229 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1230 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1231 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1233 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1234 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1235 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1236 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1237 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1238 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1240 vpmuludq $T3,$H2,$H0 # h3*r1
1241 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1242 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1243 vpmuludq $T2,$H2,$H1 # h2*r1
1244 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1245 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1246 vpmuludq $T1,$H2,$H0 # h1*r1
1247 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1248 vpmuludq $T0,$H2,$H2 # h0*r1
1249 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1250 vpmuludq $T4,$H3,$H3 # h4*s1
1251 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1253 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1254 vpmuludq $T2,$H4,$H1 # h2*r2
1255 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1256 vpmuludq $T1,$H4,$H0 # h1*r2
1257 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1258 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1259 vpmuludq $T0,$H4,$H4 # h0*r2
1260 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1261 vpmuludq $T4,$H2,$H1 # h4*s2
1262 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1263 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1264 vpmuludq $T3,$H2,$H2 # h3*s2
1265 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1267 vpmuludq $T1,$H3,$H0 # h1*r3
1268 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1269 vpmuludq $T0,$H3,$H3 # h0*r3
1270 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1271 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1272 vpmuludq $T4,$H4,$H1 # h4*s3
1273 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1274 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1275 vpmuludq $T3,$H4,$H0 # h3*s3
1276 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1277 vpmuludq $T2,$H4,$H4 # h2*s3
1278 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1280 vpmuludq $T0,$H2,$H2 # h0*r4
1281 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1282 vpmuludq $T4,$H3,$H1 # h4*s4
1283 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1284 vpmuludq $T3,$H3,$H0 # h3*s4
1285 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1286 vpmuludq $T2,$H3,$H1 # h2*s4
1287 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1288 vpmuludq $T1,$H3,$H3 # h1*s4
1289 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1293 vmovdqu 16*0($inp),$H0 # load input
1294 vmovdqu 16*1($inp),$H1
1296 vpsrldq \$6,$H0,$H2 # splat input
1298 vpunpckhqdq $H1,$H0,$H4 # 4
1299 vpunpcklqdq $H1,$H0,$H0 # 0:1
1300 vpunpcklqdq $H3,$H2,$H3 # 2:3
1302 vpsrlq \$40,$H4,$H4 # 4
1304 vpand $MASK,$H0,$H0 # 0
1306 vpand $MASK,$H1,$H1 # 1
1308 vpand $MASK,$H2,$H2 # 2
1309 vpand $MASK,$H3,$H3 # 3
1310 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1312 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1313 vpaddq 0x00(%r11),$H0,$H0
1314 vpaddq 0x10(%r11),$H1,$H1
1315 vpaddq 0x20(%r11),$H2,$H2
1316 vpaddq 0x30(%r11),$H3,$H3
1317 vpaddq 0x40(%r11),$H4,$H4
1319 ################################################################
1320 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1322 vpmuludq $H0,$T4,$T0 # h0*r0
1323 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1324 vpmuludq $H1,$T4,$T1 # h1*r0
1325 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1326 vpmuludq $H2,$T4,$T0 # h2*r0
1327 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1328 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1329 vpmuludq $H3,$T4,$T1 # h3*r0
1330 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1331 vpmuludq $H4,$T4,$T4 # h4*r0
1332 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1334 vpmuludq $H3,$T2,$T0 # h3*r1
1335 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1336 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1337 vpmuludq $H2,$T2,$T1 # h2*r1
1338 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1339 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1340 vpmuludq $H1,$T2,$T0 # h1*r1
1341 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1342 vpmuludq $H0,$T2,$T2 # h0*r1
1343 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1344 vpmuludq $H4,$T3,$T3 # h4*s1
1345 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1347 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1348 vpmuludq $H2,$T4,$T1 # h2*r2
1349 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1350 vpmuludq $H1,$T4,$T0 # h1*r2
1351 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1352 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1353 vpmuludq $H0,$T4,$T4 # h0*r2
1354 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1355 vpmuludq $H4,$T2,$T1 # h4*s2
1356 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1357 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1358 vpmuludq $H3,$T2,$T2 # h3*s2
1359 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1361 vpmuludq $H1,$T3,$T0 # h1*r3
1362 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1363 vpmuludq $H0,$T3,$T3 # h0*r3
1364 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1365 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1366 vpmuludq $H4,$T4,$T1 # h4*s3
1367 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1368 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1369 vpmuludq $H3,$T4,$T0 # h3*s3
1370 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1371 vpmuludq $H2,$T4,$T4 # h2*s3
1372 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1374 vpmuludq $H0,$T2,$T2 # h0*r4
1375 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1376 vpmuludq $H4,$T3,$T1 # h4*s4
1377 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1378 vpmuludq $H3,$T3,$T0 # h3*s4
1379 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1380 vpmuludq $H2,$T3,$T1 # h2*s4
1381 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1382 vpmuludq $H1,$T3,$T3 # h1*s4
1383 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1386 ################################################################
1387 # horizontal addition
1400 ################################################################
1405 vpaddq $H3,$D4,$D4 # h3 -> h4
1409 vpaddq $H0,$D1,$D1 # h0 -> h1
1416 vpaddq $H1,$D2,$D2 # h1 -> h2
1420 vpaddq $H4,$D0,$D0 # h4 -> h0
1424 vpaddq $H2,$D3,$D3 # h2 -> h3
1428 vpaddq $H0,$D1,$D1 # h0 -> h1
1432 vpaddq $H3,$D4,$D4 # h3 -> h4
1434 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1435 vmovd $D1,`4*1-48-64`($ctx)
1436 vmovd $D2,`4*2-48-64`($ctx)
1437 vmovd $D3,`4*3-48-64`($ctx)
1438 vmovd $D4,`4*4-48-64`($ctx)
1440 $code.=<<___ if ($win64);
1441 vmovdqa 0x50(%r11),%xmm6
1442 vmovdqa 0x60(%r11),%xmm7
1443 vmovdqa 0x70(%r11),%xmm8
1444 vmovdqa 0x80(%r11),%xmm9
1445 vmovdqa 0x90(%r11),%xmm10
1446 vmovdqa 0xa0(%r11),%xmm11
1447 vmovdqa 0xb0(%r11),%xmm12
1448 vmovdqa 0xc0(%r11),%xmm13
1449 vmovdqa 0xd0(%r11),%xmm14
1450 vmovdqa 0xe0(%r11),%xmm15
1454 $code.=<<___ if (!$win64);
1456 .cfi_def_cfa_register %rsp
1463 &end_function("poly1305_blocks_avx");
1465 &declare_function("poly1305_emit_avx", 32, 3);
1467 cmpl \$0,20($ctx) # is_base2_26?
1470 mov 0($ctx),%eax # load hash value base 2^26
1476 shl \$26,%rcx # base 2^26 -> base 2^64
1492 mov %r10,%rax # could be partially reduced, so reduce
1503 add \$5,%r8 # compare to modulus
1507 shr \$2,%r10 # did 130-bit value overflow?
1511 add 0($nonce),%rax # accumulate nonce
1513 mov %rax,0($mac) # write result
1518 &end_function("poly1305_emit_avx");
1521 $code .= "#endif\n";
1527 $code .= "#ifdef CONFIG_AS_AVX2\n";
1530 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1531 map("%ymm$_",(0..15));
1534 sub poly1305_blocks_avxN {
1535 my ($avx512) = @_;
1536 my $suffix = $avx512 ? "_avx512" : "";
1539 mov 20($ctx),%r8d # is_base2_26
1541 jae .Lblocks_avx2$suffix
1545 .Lblocks_avx2$suffix:
1547 jz .Lno_data_avx2$suffix
1552 jz .Lbase2_64_avx2$suffix
1555 jz .Leven_avx2$suffix
1570 .Lblocks_avx2_body$suffix:
1572 mov $len,%r15 # reassign $len
1574 mov 0($ctx),$d1 # load hash value
1578 mov 24($ctx),$r0 # load r
1581 ################################# base 2^26 -> base 2^64
1583 and \$`-1*(1<<31)`,$d1
1584 mov $d2,$r1 # borrow $r1
1586 and \$`-1*(1<<31)`,$d2
1600 adc \$0,$h2 # can be partially reduced...
1602 mov \$-4,$d2 # ... so reduce
1615 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1617 .Lbase2_26_pre_avx2$suffix:
1618 add 0($inp),$h0 # accumulate input
1624 call __poly1305_block
1628 jnz .Lbase2_26_pre_avx2$suffix
1630 test $padbit,$padbit # if $padbit is zero,
1631 jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
1633 ################################# base 2^64 -> base 2^26
1640 and \$0x3ffffff,%rax # h[0]
1642 and \$0x3ffffff,%rdx # h[1]
1646 and \$0x3ffffff,$h0 # h[2]
1648 and \$0x3ffffff,$h1 # h[3]
1652 jz .Lstore_base2_26_avx2$suffix
1659 jmp .Lproceed_avx2$suffix
1662 .Lstore_base2_64_avx2$suffix:
1665 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1666 jmp .Ldone_avx2$suffix
1669 .Lstore_base2_26_avx2$suffix:
1670 mov %rax#d,0($ctx) # store hash value base 2^26
1689 .Lno_data_avx2$suffix:
1690 .Lblocks_avx2_epilogue$suffix:
1695 .Lbase2_64_avx2$suffix:
1710 .Lbase2_64_avx2_body$suffix:
1712 mov $len,%r15 # reassign $len
1714 mov 24($ctx),$r0 # load r
1717 mov 0($ctx),$h0 # load hash value
1724 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1727 jz .Linit_avx2$suffix
1729 .Lbase2_64_pre_avx2$suffix:
1730 add 0($inp),$h0 # accumulate input
1736 call __poly1305_block
1740 jnz .Lbase2_64_pre_avx2$suffix
1743 ################################# base 2^64 -> base 2^26
1750 and \$0x3ffffff,%rax # h[0]
1752 and \$0x3ffffff,%rdx # h[1]
1756 and \$0x3ffffff,$h0 # h[2]
1758 and \$0x3ffffff,$h1 # h[3]
1766 movl \$1,20($ctx) # set is_base2_26
1768 call __poly1305_init_avx
1770 .Lproceed_avx2$suffix:
1771 mov %r15,$len # restore $len
1773 $code.=<<___ if (!$kernel);
1774 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1775 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1790 .Lbase2_64_avx2_epilogue$suffix:
1791 jmp .Ldo_avx2$suffix
1798 $code.=<<___ if (!$kernel);
1799 mov OPENSSL_ia32cap_P+8(%rip),%r9d
1802 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1803 vmovd 4*1($ctx),%x#$H1
1804 vmovd 4*2($ctx),%x#$H2
1805 vmovd 4*3($ctx),%x#$H3
1806 vmovd 4*4($ctx),%x#$H4
1810 $code.=<<___ if (!$kernel && $avx>2);
1814 test \$`1<<16`,%r9d # check for AVX512F
1816 .Lskip_avx512$suffix:
1818 $code.=<<___ if ($avx > 2 && $avx512 && $kernel);
1822 $code.=<<___ if (!$win64);
1824 .cfi_def_cfa_register %r10
1827 $code.=<<___ if ($win64);
1830 vmovdqa %xmm6,-0xb0(%r10)
1831 vmovdqa %xmm7,-0xa0(%r10)
1832 vmovdqa %xmm8,-0x90(%r10)
1833 vmovdqa %xmm9,-0x80(%r10)
1834 vmovdqa %xmm10,-0x70(%r10)
1835 vmovdqa %xmm11,-0x60(%r10)
1836 vmovdqa %xmm12,-0x50(%r10)
1837 vmovdqa %xmm13,-0x40(%r10)
1838 vmovdqa %xmm14,-0x30(%r10)
1839 vmovdqa %xmm15,-0x20(%r10)
1840 .Ldo_avx2_body$suffix:
1843 lea .Lconst(%rip),%rcx
1844 lea 48+64($ctx),$ctx # size optimization
1845 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1847 # expand and copy pre-calculated table to stack
1848 vmovdqu `16*0-64`($ctx),%x#$T2
1850 vmovdqu `16*1-64`($ctx),%x#$T3
1851 vmovdqu `16*2-64`($ctx),%x#$T4
1852 vmovdqu `16*3-64`($ctx),%x#$D0
1853 vmovdqu `16*4-64`($ctx),%x#$D1
1854 vmovdqu `16*5-64`($ctx),%x#$D2
1855 lea 0x90(%rsp),%rax # size optimization
1856 vmovdqu `16*6-64`($ctx),%x#$D3
1857 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1858 vmovdqu `16*7-64`($ctx),%x#$D4
1860 vmovdqu `16*8-64`($ctx),%x#$MASK
1862 vmovdqa $T2,0x00(%rsp)
1864 vmovdqa $T3,0x20-0x90(%rax)
1866 vmovdqa $T4,0x40-0x90(%rax)
1868 vmovdqa $D0,0x60-0x90(%rax)
1870 vmovdqa $D1,0x80-0x90(%rax)
1872 vmovdqa $D2,0xa0-0x90(%rax)
1873 vpermd $MASK,$T0,$MASK
1874 vmovdqa $D3,0xc0-0x90(%rax)
1875 vmovdqa $D4,0xe0-0x90(%rax)
1876 vmovdqa $MASK,0x100-0x90(%rax)
1877 vmovdqa 64(%rcx),$MASK # .Lmask26
1879 ################################################################
1881 vmovdqu 16*0($inp),%x#$T0
1882 vmovdqu 16*1($inp),%x#$T1
1883 vinserti128 \$1,16*2($inp),$T0,$T0
1884 vinserti128 \$1,16*3($inp),$T1,$T1
1887 vpsrldq \$6,$T0,$T2 # splat input
1889 vpunpckhqdq $T1,$T0,$T4 # 4
1890 vpunpcklqdq $T3,$T2,$T2 # 2:3
1891 vpunpcklqdq $T1,$T0,$T0 # 0:1
1896 vpsrlq \$40,$T4,$T4 # 4
1897 vpand $MASK,$T2,$T2 # 2
1898 vpand $MASK,$T0,$T0 # 0
1899 vpand $MASK,$T1,$T1 # 1
1900 vpand $MASK,$T3,$T3 # 3
1901 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1903 vpaddq $H2,$T2,$H2 # accumulate input
1905 jz .Ltail_avx2$suffix
1906 jmp .Loop_avx2$suffix
1910 ################################################################
1911 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1912 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1913 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1914 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1915 # \________/\__________/
1916 ################################################################
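#
# This is the 4-lane analogue of the 2-way schedule sketched in the AVX
# section: all lanes step by r^4, and the final iteration multiplies the
# lanes by r^4, r^3, r^2, r respectively before the horizontal addition.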
1917 #vpaddq $H2,$T2,$H2 # accumulate input
1919 vmovdqa `32*0`(%rsp),$T0 # r0^4
1921 vmovdqa `32*1`(%rsp),$T1 # r1^4
1923 vmovdqa `32*3`(%rsp),$T2 # r2^4
1925 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1926 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1928 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1929 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1930 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1931 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1932 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1934 # however, as h2 is "chronologically" the first one available, we pull
1935 # the corresponding operations up, so the schedule becomes:
1937 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1938 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1939 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1940 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1941 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1943 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1944 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1945 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1946 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1947 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1949 vpmuludq $H0,$T1,$T4 # h0*r1
1950 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1951 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1952 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1953 vpmuludq $H3,$T1,$T4 # h3*r1
1954 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1955 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1956 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1957 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1959 vpmuludq $H0,$T0,$T4 # h0*r0
1960 vpmuludq $H1,$T0,$H2 # h1*r0
1961 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1962 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1963 vpmuludq $H3,$T0,$T4 # h3*r0
1964 vpmuludq $H4,$T0,$H2 # h4*r0
1965 vmovdqu 16*0($inp),%x#$T0 # load input
1966 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1967 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1968 vinserti128 \$1,16*2($inp),$T0,$T0
1970 vpmuludq $H3,$T1,$T4 # h3*s2
1971 vpmuludq $H4,$T1,$H2 # h4*s2
1972 vmovdqu 16*1($inp),%x#$T1
1973 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1974 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1975 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1976 vpmuludq $H1,$T2,$T4 # h1*r2
1977 vpmuludq $H0,$T2,$T2 # h0*r2
1978 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1979 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1980 vinserti128 \$1,16*3($inp),$T1,$T1
1983 vpmuludq $H1,$H2,$T4 # h1*r3
1984 vpmuludq $H0,$H2,$H2 # h0*r3
1985 vpsrldq \$6,$T0,$T2 # splat input
1986 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1987 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1988 vpmuludq $H3,$T3,$T4 # h3*s3
1989 vpmuludq $H4,$T3,$H2 # h4*s3
1991 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1992 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1993 vpunpckhqdq $T1,$T0,$T4 # 4
1995 vpmuludq $H3,$S4,$H3 # h3*s4
1996 vpmuludq $H4,$S4,$H4 # h4*s4
1997 vpunpcklqdq $T1,$T0,$T0 # 0:1
1998 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1999 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
2000 vpunpcklqdq $T3,$T2,$T3 # 2:3
2001 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
2002 vpmuludq $H1,$S4,$H0 # h1*s4
2003 vmovdqa 64(%rcx),$MASK # .Lmask26
2004 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2005 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2007 ################################################################
2008 # lazy reduction (interleaved with tail of input splat)
2012 vpaddq $D3,$H4,$H4 # h3 -> h4
2016 vpaddq $D0,$D1,$H1 # h0 -> h1
2025 vpaddq $D1,$H2,$H2 # h1 -> h2
2029 vpaddq $D4,$H0,$H0 # h4 -> h0
2031 vpand $MASK,$T2,$T2 # 2
2036 vpaddq $D2,$H3,$H3 # h2 -> h3
2038 vpaddq $T2,$H2,$H2 # modulo-scheduled
2043 vpaddq $D0,$H1,$H1 # h0 -> h1
2045 vpsrlq \$40,$T4,$T4 # 4
2049 vpaddq $D3,$H4,$H4 # h3 -> h4
2051 vpand $MASK,$T0,$T0 # 0
2052 vpand $MASK,$T1,$T1 # 1
2053 vpand $MASK,$T3,$T3 # 3
2054 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2057 jnz .Loop_avx2$suffix
2061 ################################################################
2062 # while the multiplications above were by r^4 in all lanes, in the last
2063 # iteration we multiply the least significant lane by r^4 and the most
2064 # significant one by r, so this is a copy of the above except that
2065 # references to the precomputed table are displaced by 4...
2067 #vpaddq $H2,$T2,$H2 # accumulate input
2069 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
2071 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
2073 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2075 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
2076 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
2078 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
2079 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
2080 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
2081 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
2082 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2084 vpmuludq $H0,$T1,$T4 # h0*r1
2085 vpmuludq $H1,$T1,$H2 # h1*r1
2086 vpaddq $T4,$D1,$D1 # d1 += h0*r1
2087 vpaddq $H2,$D2,$D2 # d2 += h1*r1
2088 vpmuludq $H3,$T1,$T4 # h3*r1
2089 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
2090 vpaddq $T4,$D4,$D4 # d4 += h3*r1
2091 vpaddq $H2,$D0,$D0 # d0 += h4*s1
2093 vpmuludq $H0,$T0,$T4 # h0*r0
2094 vpmuludq $H1,$T0,$H2 # h1*r0
2095 vpaddq $T4,$D0,$D0 # d0 += h0*r0
2096 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2097 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2098 vpmuludq $H3,$T0,$T4 # h3*r0
2099 vpmuludq $H4,$T0,$H2 # h4*r0
2100 vpaddq $T4,$D3,$D3 # d3 += h3*r0
2101 vpaddq $H2,$D4,$D4 # d4 += h4*r0
2103 vpmuludq $H3,$T1,$T4 # h3*s2
2104 vpmuludq $H4,$T1,$H2 # h4*s2
2105 vpaddq $T4,$D0,$D0 # d0 += h3*s2
2106 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2107 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2108 vpmuludq $H1,$T2,$T4 # h1*r2
2109 vpmuludq $H0,$T2,$T2 # h0*r2
2110 vpaddq $T4,$D3,$D3 # d3 += h1*r2
2111 vpaddq $T2,$D2,$D2 # d2 += h0*r2
2113 vpmuludq $H1,$H2,$T4 # h1*r3
2114 vpmuludq $H0,$H2,$H2 # h0*r3
2115 vpaddq $T4,$D4,$D4 # d4 += h1*r3
2116 vpaddq $H2,$D3,$D3 # d3 += h0*r3
2117 vpmuludq $H3,$T3,$T4 # h3*s3
2118 vpmuludq $H4,$T3,$H2 # h4*s3
2119 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2120 vpaddq $H2,$D2,$D2 # d2 += h4*s3
2122 vpmuludq $H3,$S4,$H3 # h3*s4
2123 vpmuludq $H4,$S4,$H4 # h4*s4
2124 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
2125 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
2126 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2127 vpmuludq $H1,$S4,$H0 # h1*s4
2128 vmovdqa 64(%rcx),$MASK # .Lmask26
2129 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2130 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2132 ################################################################
2133 # horizontal addition
2146 vpermq \$0x2,$H3,$T3
2147 vpermq \$0x2,$H4,$T4
2148 vpermq \$0x2,$H0,$T0
2149 vpermq \$0x2,$D1,$T1
2150 vpermq \$0x2,$H2,$T2
2157 ################################################################
2162 vpaddq $D3,$H4,$H4 # h3 -> h4
2166 vpaddq $D0,$D1,$H1 # h0 -> h1
2173 vpaddq $D1,$H2,$H2 # h1 -> h2
2177 vpaddq $D4,$H0,$H0 # h4 -> h0
2181 vpaddq $D2,$H3,$H3 # h2 -> h3
2185 vpaddq $D0,$H1,$H1 # h0 -> h1
2189 vpaddq $D3,$H4,$H4 # h3 -> h4
2191 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2192 vmovd %x#$H1,`4*1-48-64`($ctx)
2193 vmovd %x#$H2,`4*2-48-64`($ctx)
2194 vmovd %x#$H3,`4*3-48-64`($ctx)
2195 vmovd %x#$H4,`4*4-48-64`($ctx)
2197 $code.=<<___ if ($win64);
2198 vmovdqa -0xb0(%r10),%xmm6
2199 vmovdqa -0xa0(%r10),%xmm7
2200 vmovdqa -0x90(%r10),%xmm8
2201 vmovdqa -0x80(%r10),%xmm9
2202 vmovdqa -0x70(%r10),%xmm10
2203 vmovdqa -0x60(%r10),%xmm11
2204 vmovdqa -0x50(%r10),%xmm12
2205 vmovdqa -0x40(%r10),%xmm13
2206 vmovdqa -0x30(%r10),%xmm14
2207 vmovdqa -0x20(%r10),%xmm15
2209 .Ldo_avx2_epilogue$suffix:
2211 $code.=<<___ if (!$win64);
2213 .cfi_def_cfa_register %rsp
2220 if($avx > 2 && $avx512) {
2221 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2222 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2223 my $PADBIT="%zmm30";
2225 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2226 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2227 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2228 map(s/%y/%z/,($MASK));
2236 $code.=<<___ if (!$win64);
2238 .cfi_def_cfa_register %r10
2241 $code.=<<___ if ($win64);
2244 vmovdqa %xmm6,-0xb0(%r10)
2245 vmovdqa %xmm7,-0xa0(%r10)
2246 vmovdqa %xmm8,-0x90(%r10)
2247 vmovdqa %xmm9,-0x80(%r10)
2248 vmovdqa %xmm10,-0x70(%r10)
2249 vmovdqa %xmm11,-0x60(%r10)
2250 vmovdqa %xmm12,-0x50(%r10)
2251 vmovdqa %xmm13,-0x40(%r10)
2252 vmovdqa %xmm14,-0x30(%r10)
2253 vmovdqa %xmm15,-0x20(%r10)
2257 lea .Lconst(%rip),%rcx
2258 lea 48+64($ctx),$ctx # size optimization
2259 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2261 # expand pre-calculated table
2262 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2264 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2266 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2267 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2268 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2269 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2270 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2271 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2272 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2273 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2274 vpbroadcastq 64(%rcx),$MASK # .Lmask26
2278 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2279 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2281 vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2284 vmovdqa64 $S1,0x40(%rsp){%k2}
2287 vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2289 vmovdqa64 $S2,0x80(%rsp){%k2}
2290 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2291 vmovdqa64 $S3,0xc0(%rsp){%k2}
2292 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2293 vmovdqa64 $S4,0x100(%rsp){%k2}
2295 ################################################################
2296 # calculate 5th through 8th powers of the key
2298 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2299 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2300 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2301 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2302 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2304 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2305 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2306 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2307 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2308 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2311 vpmuludq $T1,$S4,$M0
2312 vpmuludq $T1,$R0,$M1
2313 vpmuludq $T1,$R1,$M2
2314 vpmuludq $T1,$R2,$M3
2315 vpmuludq $T1,$R3,$M4
2317 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2318 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2319 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2320 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2321 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2323 vpmuludq $T2,$S3,$M0
2324 vpmuludq $T2,$S4,$M1
2325 vpmuludq $T2,$R1,$M3
2326 vpmuludq $T2,$R2,$M4
2327 vpmuludq $T2,$R0,$M2
2329 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2330 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2331 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2332 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2333 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2335 vpmuludq $T3,$S2,$M0
2336 vpmuludq $T3,$R0,$M3
2337 vpmuludq $T3,$R1,$M4
2338 vpmuludq $T3,$S3,$M1
2339 vpmuludq $T3,$S4,$M2
2340 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2341 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2342 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2343 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2344 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2346 vpmuludq $T4,$S4,$M3
2347 vpmuludq $T4,$R0,$M4
2348 vpmuludq $T4,$S1,$M0
2349 vpmuludq $T4,$S2,$M1
2350 vpmuludq $T4,$S3,$M2
2351 vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2352 vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2353 vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2354 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2355 vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2357 ################################################################
2359 vmovdqu64 16*0($inp),%z#$T3
2360 vmovdqu64 16*4($inp),%z#$T4
2363 ################################################################
2367 vpandq $MASK,$D3,$D3
2368 vpaddq $M3,$D4,$D4 # d3 -> d4
2371 vpandq $MASK,$D0,$D0
2372 vpaddq $M0,$D1,$D1 # d0 -> d1
2375 vpandq $MASK,$D4,$D4
2378 vpandq $MASK,$D1,$D1
2379 vpaddq $M1,$D2,$D2 # d1 -> d2
2383 vpaddq $M4,$D0,$D0 # d4 -> d0
2386 vpandq $MASK,$D2,$D2
2387 vpaddq $M2,$D3,$D3 # d2 -> d3
2390 vpandq $MASK,$D0,$D0
2391 vpaddq $M0,$D1,$D1 # d0 -> d1
2394 vpandq $MASK,$D3,$D3
2395 vpaddq $M3,$D4,$D4 # d3 -> d4
2397 ################################################################
2398 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2401 vpunpcklqdq $T4,$T3,$T0 # transpose input
2402 vpunpckhqdq $T4,$T3,$T4
2404 # ... since input 64-bit lanes are ordered as 73625140, we could
2405 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2406 # we could just flow along, hence the goal for $R0-$S4 is
2407 # 1858286838784888 ...
2409 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2413 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2419 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2420 vpermd $D1,$M0,${R1}{%k1}
2421 vpermd $D2,$M0,${R2}{%k1}
2422 vpermd $D3,$M0,${R3}{%k1}
2423 vpermd $D4,$M0,${R4}{%k1}
2425 vpslld \$2,$R1,$S1 # *5
2434 vpbroadcastq 32(%rcx),$PADBIT # .L129
2436 vpsrlq \$52,$T0,$T2 # splat input
2441 vpsrlq \$40,$T4,$T4 # 4
2442 vpandq $MASK,$T2,$T2 # 2
2443 vpandq $MASK,$T0,$T0 # 0
2444 #vpandq $MASK,$T1,$T1 # 1
2445 #vpandq $MASK,$T3,$T3 # 3
2446 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2448 vpaddq $H2,$T2,$H2 # accumulate input
2455 ################################################################
2456 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2457 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2458 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2459 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2460 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2461 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2462 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2463 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2464 # \________/\___________/
2465 ################################################################
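#
# ... i.e. the 8-lane analogue of the schedules above: all lanes step by
# r^8, and the tail multiplies the lanes by r^8 down to r^1 before the
# horizontal addition.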
2466 #vpaddq $H2,$T2,$H2 # accumulate input
2468 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2469 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2470 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2471 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2472 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2474 # however, as h2 is "chronologically" the first one available, we pull
2475 # the corresponding operations up, so the schedule becomes:
2477 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2478 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2479 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2480 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2481 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2483 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2485 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2486 vpandq $MASK,$T1,$T1 # 1
2487 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2488 vpandq $MASK,$T3,$T3 # 3
2489 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2490 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2491 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2492 vpaddq $H1,$T1,$H1 # accumulate input
2496 vmovdqu64 16*0($inp),$T3 # load input
2497 vmovdqu64 16*4($inp),$T4
2499 vpmuludq $H0,$R3,$M3
2500 vpmuludq $H0,$R4,$M4
2501 vpmuludq $H0,$R0,$M0
2502 vpmuludq $H0,$R1,$M1
2503 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2504 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2505 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2506 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2508 vpmuludq $H1,$R2,$M3
2509 vpmuludq $H1,$R3,$M4
2510 vpmuludq $H1,$S4,$M0
2511 vpmuludq $H0,$R2,$M2
2512 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2513 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2514 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2515 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2517 vpunpcklqdq $T4,$T3,$T0 # transpose input
2518 vpunpckhqdq $T4,$T3,$T4
2520 vpmuludq $H3,$R0,$M3
2521 vpmuludq $H3,$R1,$M4
2522 vpmuludq $H1,$R0,$M1
2523 vpmuludq $H1,$R1,$M2
2524 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2525 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2526 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2527 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2529 vpmuludq $H4,$S4,$M3
2530 vpmuludq $H4,$R0,$M4
2531 vpmuludq $H3,$S2,$M0
2532 vpmuludq $H3,$S3,$M1
2533 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2534 vpmuludq $H3,$S4,$M2
2535 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2536 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2537 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2538 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2540 vpmuludq $H4,$S1,$M0
2541 vpmuludq $H4,$S2,$M1
2542 vpmuludq $H4,$S3,$M2
2543 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2544 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2545 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2547 ################################################################
2548 # lazy reduction (interleaved with input splat)
2550 vpsrlq \$52,$T0,$T2 # splat input
2554 vpandq $MASK,$D3,$D3
2555 vpaddq $H3,$D4,$H4 # h3 -> h4
2560 vpandq $MASK,$H0,$H0
2561 vpaddq $D0,$H1,$H1 # h0 -> h1
2563 vpandq $MASK,$T2,$T2 # 2
2566 vpandq $MASK,$H4,$H4
2569 vpandq $MASK,$H1,$H1
2570 vpaddq $D1,$H2,$H2 # h1 -> h2
2574 vpaddq $D4,$H0,$H0 # h4 -> h0
2576 vpaddq $T2,$H2,$H2 # modulo-scheduled
2580 vpandq $MASK,$H2,$H2
2581 vpaddq $D2,$D3,$H3 # h2 -> h3
2586 vpandq $MASK,$H0,$H0
2587 vpaddq $D0,$H1,$H1 # h0 -> h1
2589 vpsrlq \$40,$T4,$T4 # 4
2592 vpandq $MASK,$H3,$H3
2593 vpaddq $D3,$H4,$H4 # h3 -> h4
2595 vpandq $MASK,$T0,$T0 # 0
2596 #vpandq $MASK,$T1,$T1 # 1
2597 #vpandq $MASK,$T3,$T3 # 3
2598 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2604 ################################################################
2605 # while the multiplications above were by r^8 in all lanes, in the last
2606 # iteration we multiply the least significant lane by r^8 and the most
2607 # significant one by r; that's why the table gets shifted...
2609 vpsrlq \$32,$R0,$R0 # 0105020603070408
2619 ################################################################
2620 # load either next or last 64 byte of input
2621 lea ($inp,$len),$inp
2623 #vpaddq $H2,$T2,$H2 # accumulate input
2626 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2627 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2628 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2629 vpandq $MASK,$T1,$T1 # 1
2630 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2631 vpandq $MASK,$T3,$T3 # 3
2632 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2633 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2634 vpaddq $H1,$T1,$H1 # accumulate input
2638 vmovdqu 16*0($inp),%x#$T0
2639 vpmuludq $H0,$R3,$M3
2640 vpmuludq $H0,$R4,$M4
2641 vpmuludq $H0,$R0,$M0
2642 vpmuludq $H0,$R1,$M1
2643 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2644 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2645 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2646 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2648 vmovdqu 16*1($inp),%x#$T1
2649 vpmuludq $H1,$R2,$M3
2650 vpmuludq $H1,$R3,$M4
2651 vpmuludq $H1,$S4,$M0
2652 vpmuludq $H0,$R2,$M2
2653 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2654 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2655 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2656 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2658 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2659 vpmuludq $H3,$R0,$M3
2660 vpmuludq $H3,$R1,$M4
2661 vpmuludq $H1,$R0,$M1
2662 vpmuludq $H1,$R1,$M2
2663 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2664 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2665 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2666 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2668 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2669 vpmuludq $H4,$S4,$M3
2670 vpmuludq $H4,$R0,$M4
2671 vpmuludq $H3,$S2,$M0
2672 vpmuludq $H3,$S3,$M1
2673 vpmuludq $H3,$S4,$M2
2674 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2675 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2676 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2677 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2678 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2680 vpmuludq $H4,$S1,$M0
2681 vpmuludq $H4,$S2,$M1
2682 vpmuludq $H4,$S3,$M2
2683 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2684 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2685 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2687 ################################################################
2688 # horizontal addition
2691 vpermq \$0xb1,$H3,$D3
2692 vpermq \$0xb1,$D4,$H4
2693 vpermq \$0xb1,$H0,$D0
2694 vpermq \$0xb1,$H1,$D1
2695 vpermq \$0xb1,$H2,$D2
2703 vpermq \$0x2,$H3,$D3
2704 vpermq \$0x2,$H4,$D4
2705 vpermq \$0x2,$H0,$D0
2706 vpermq \$0x2,$H1,$D1
2707 vpermq \$0x2,$H2,$D2
2714 vextracti64x4 \$0x1,$H3,%y#$D3
2715 vextracti64x4 \$0x1,$H4,%y#$D4
2716 vextracti64x4 \$0x1,$H0,%y#$D0
2717 vextracti64x4 \$0x1,$H1,%y#$D1
2718 vextracti64x4 \$0x1,$H2,%y#$D2
2719 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2720 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2721 vpaddq $D0,$H0,${H0}{%k3}{z}
2722 vpaddq $D1,$H1,${H1}{%k3}{z}
2723 vpaddq $D2,$H2,${H2}{%k3}{z}
2725 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2726 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2728 ################################################################
2729 # lazy reduction (interleaved with input splat)
2733 vpsrldq \$6,$T0,$T2 # splat input
2735 vpunpckhqdq $T1,$T0,$T4 # 4
2736 vpaddq $D3,$H4,$H4 # h3 -> h4
2740 vpunpcklqdq $T3,$T2,$T2 # 2:3
2741 vpunpcklqdq $T1,$T0,$T0 # 0:1
2742 vpaddq $D0,$H1,$H1 # h0 -> h1
2751 vpaddq $D1,$H2,$H2 # h1 -> h2
2756 vpsrlq \$40,$T4,$T4 # 4
2757 vpaddq $D4,$H0,$H0 # h4 -> h0
2761 vpand $MASK,$T2,$T2 # 2
2762 vpand $MASK,$T0,$T0 # 0
2763 vpaddq $D2,$H3,$H3 # h2 -> h3
2767 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2768 vpand $MASK,$T1,$T1 # 1
2769 vpaddq $D0,$H1,$H1 # h0 -> h1
2773 vpand $MASK,$T3,$T3 # 3
2774 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2775 vpaddq $D3,$H4,$H4 # h3 -> h4
2777 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2779 jnz .Ltail_avx2$suffix
2781 vpsubq $T2,$H2,$H2 # undo input accumulation
2782 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2783 vmovd %x#$H1,`4*1-48-64`($ctx)
2784 vmovd %x#$H2,`4*2-48-64`($ctx)
2785 vmovd %x#$H3,`4*3-48-64`($ctx)
2786 vmovd %x#$H4,`4*4-48-64`($ctx)
2789 $code.=<<___ if ($win64);
2790 movdqa -0xb0(%r10),%xmm6
2791 movdqa -0xa0(%r10),%xmm7
2792 movdqa -0x90(%r10),%xmm8
2793 movdqa -0x80(%r10),%xmm9
2794 movdqa -0x70(%r10),%xmm10
2795 movdqa -0x60(%r10),%xmm11
2796 movdqa -0x50(%r10),%xmm12
2797 movdqa -0x40(%r10),%xmm13
2798 movdqa -0x30(%r10),%xmm14
2799 movdqa -0x20(%r10),%xmm15
2801 .Ldo_avx512_epilogue:
2803 $code.=<<___ if (!$win64);
2805 .cfi_def_cfa_register %rsp
2816 &declare_function("poly1305_blocks_avx2", 32, 4);
2817 poly1305_blocks_avxN(0);
2818 &end_function("poly1305_blocks_avx2");
2821 $code .= "#endif\n";
2824 #######################################################################
2826 # On entry the input length is divisible by 64. But since the inner loop
2827 # processes 128 bytes per iteration, cases where the length is not divisible
2828 # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2829 # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2830 # for this tail, we wouldn't even have to allocate a stack frame...
2833 $code .= "#ifdef CONFIG_AS_AVX512\n";
2836 &declare_function("poly1305_blocks_avx512", 32, 4);
2837 poly1305_blocks_avxN(1);
2838 &end_function("poly1305_blocks_avx512");
2841 $code .= "#endif\n";
2844 if (!$kernel && $avx>3) {
2845 ########################################################################
2846 # VPMADD52 version using 2^44 radix.
2848 # One can argue that base 2^52 would be more natural. Well, even though
2849 # some operations would be more natural, one has to recognize a couple of
2850 # things. First, base 2^52 doesn't provide an advantage over base 2^44 in
2851 # the amount of multiply-and-accumulate operations. Second, it makes it
2852 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2853 # reference implementations], which means that more such operations
2854 # would have to be performed in the inner loop, which in turn makes the
2855 # critical path longer. In other words, even though base 2^44 reduction
2856 # might look less elegant, the overall critical path is actually shorter...
2858 ########################################################################
2859 # The layout of the opaque area is as follows.
2861 # unsigned __int64 h[3]; # current hash value base 2^44
2862 # unsigned __int64 s[2]; # key value*20 base 2^44
2863 # unsigned __int64 r[3]; # key value base 2^44
2864 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2865 # # r^n positions reflect
2866 # # placement in register, not
2867 # # memory, R[3] is R[1]*20
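# For illustration only, the byte offsets implied above as a Perl map
# (a hypothetical name; the generator does not consult it):
my %ctx_layout_sketch = (
	h  => 0,		# hash limbs, base 2^44
	s1 => 24, s2 => 32,	# 20*r1, 20*r2
	r  => 40,		# key limbs, base 2^44
	R  => 64,		# key powers; -1 here means "not computed yet"
);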
2870 .type poly1305_init_base2_44,\@function,3
2872 poly1305_init_base2_44:
2874 mov %rax,0($ctx) # initialize hash value
2879 lea poly1305_blocks_vpmadd52(%rip),%r10
2880 lea poly1305_emit_base2_44(%rip),%r11
2882 mov \$0x0ffffffc0fffffff,%rax
2883 mov \$0x0ffffffc0ffffffc,%rcx
2885 mov \$0x00000fffffffffff,%r8
2887 mov \$0x00000fffffffffff,%r9
2890 mov %r8,40($ctx) # r0
2893 mov %rax,48($ctx) # r1
2894 lea (%rax,%rax,4),%rax # *5
2895 mov %rcx,56($ctx) # r2
2896 shl \$2,%rax # magic <<2
2897 lea (%rcx,%rcx,4),%rcx # *5
2898 shl \$2,%rcx # magic <<2
2899 mov %rax,24($ctx) # s1
2900 mov %rcx,32($ctx) # s2
2901 movq \$-1,64($ctx) # write impossible value
2903 $code.=<<___ if ($flavour !~ /elf32/);
2907 $code.=<<___ if ($flavour =~ /elf32/);
2914 .size poly1305_init_base2_44,.-poly1305_init_base2_44
2917 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2918 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2919 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
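# A hedged reference model (unused by the generator) of what
# poly1305_init_base2_44 above does with the 16-byte key: clamp r,
# split it into base 2^44 limbs, and derive s1 = 20*r1, s2 = 20*r2.
sub init_base2_44_sketch {
	my ($k0, $k1) = @_;			# low/high 64-bit key halves
	$k0 &= 0x0ffffffc0fffffff;		# clamp, as in the code above
	$k1 &= 0x0ffffffc0ffffffc;
	my $mask44 = 0x00000fffffffffff;
	my $r0 = $k0 & $mask44;
	my $r1 = (($k0 >> 44) | ($k1 << 20)) & $mask44;	# bits 44..87
	my $r2 = $k1 >> 24;				# remaining bits
	return ($r0, $r1, $r2, 20*$r1, 20*$r2);	# r0, r1, r2, s1, s2
}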
2922 .type poly1305_blocks_vpmadd52,\@function,4
2924 poly1305_blocks_vpmadd52:
2926 jz .Lno_data_vpmadd52 # too short
2929 mov 64($ctx),%r8 # peek at the key power
2931 # if the key powers have not been calculated yet, process up to 3
2932 # blocks with this single-block subroutine; otherwise ensure that
2933 # the length is divisible by 2 blocks and pass the rest down to the next
2938 cmp \$4,$len # is input long
2940 test %r8,%r8 # is power value impossible?
2943 and $len,%rax # is input of favourable length?
2944 jz .Lblocks_vpmadd52_4x
2950 lea .L2_44_inp_permd(%rip),%r10
2953 vmovq $padbit,%x#$PAD
2954 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2955 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2956 vpermq \$0xcf,$PAD,$PAD
2957 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2959 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2960 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2961 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2962 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2964 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2965 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2971 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2974 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2975 vpsrlvq $inp_shift,$T0,$T0
2976 vpandq $reduc_mask,$T0,$T0
2979 vpaddq $T0,$Dlo,$Dlo # accumulate input
2981 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2982 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2983 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2985 vpxord $Dlo,$Dlo,$Dlo
2986 vpxord $Dhi,$Dhi,$Dhi
2988 vpmadd52luq $r2r1r0,$H0,$Dlo
2989 vpmadd52huq $r2r1r0,$H0,$Dhi
2991 vpmadd52luq $r1r0s2,$H1,$Dlo
2992 vpmadd52huq $r1r0s2,$H1,$Dhi
2994 vpmadd52luq $r0s2s1,$H2,$Dlo
2995 vpmadd52huq $r0s2s1,$H2,$Dhi
2997 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2998 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2999 vpandq $reduc_mask,$Dlo,$Dlo
3001 vpaddq $T0,$Dhi,$Dhi
3003 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
3005 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
3007 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
3008 vpandq $reduc_mask,$Dlo,$Dlo
3010 vpermq \$0b10010011,$T0,$T0
3012 vpaddq $T0,$Dlo,$Dlo
3014 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
3016 vpaddq $T0,$Dlo,$Dlo
3019 vpaddq $T0,$Dlo,$Dlo
3024 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
3027 jnz .Lblocks_vpmadd52_4x
3031 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3035 ########################################################################
3036 # As implied by its name, the 4x subroutine processes 4 blocks in parallel
3037 # (though it also handles lengths of 4*n+2 blocks). It takes key powers up
3038 # to the 4th and works in 256-bit %ymm registers.
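# A hedged reference model (unused) of the 4-way evaluation: four blocks
# are folded per step with r^4 applied to the accumulator lane, i.e.
# h = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r (mod 2^130-5), which yields
# the same result as the serial Horner scheme h = (h + m)*r.
sub poly1305_4x_sketch {
	use bigint;			# literals below are big integers
	my ($h, $r, @m) = @_;		# hash, key, 4*n block values
	$_ = 0 + $_ for ($h, $r, @m);	# promote inputs to big integers
	my $p  = 2**130 - 5;
	my @rp = map { $r**$_ % $p } (1..4);	# r^1..r^4
	while (@m >= 4) {
		my @b = splice(@m, 0, 4);
		$h = (($h + $b[0])*$rp[3] + $b[1]*$rp[2] +
		      $b[2]*$rp[1] + $b[3]*$rp[0]) % $p;
	}
	return $h;
}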
3040 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3041 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3042 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3045 .type poly1305_blocks_vpmadd52_4x,\@function,4
3047 poly1305_blocks_vpmadd52_4x:
3049 jz .Lno_data_vpmadd52_4x # too short
3052 mov 64($ctx),%r8 # peek at the key power
3054 .Lblocks_vpmadd52_4x:
3055 vpbroadcastq $padbit,$PAD
3057 vmovdqa64 .Lx_mask44(%rip),$mask44
3059 vmovdqa64 .Lx_mask42(%rip),$mask42
3060 kmovw %eax,%k1 # used in 2x path
3062 test %r8,%r8 # is power value impossible?
3063 js .Linit_vpmadd52 # if it is, then init R[4]
3065 vmovq 0($ctx),%x#$H0 # load current hash value
3066 vmovq 8($ctx),%x#$H1
3067 vmovq 16($ctx),%x#$H2
3069 test \$3,$len # is length 4*n+2?
3070 jnz .Lblocks_vpmadd52_2x_do
3072 .Lblocks_vpmadd52_4x_do:
3073 vpbroadcastq 64($ctx),$R0 # load 4th power of the key
3074 vpbroadcastq 96($ctx),$R1
3075 vpbroadcastq 128($ctx),$R2
3076 vpbroadcastq 160($ctx),$S1
3078 .Lblocks_vpmadd52_4x_key_loaded:
3079 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3083 test \$7,$len # is len 8*n?
3084 jz .Lblocks_vpmadd52_8x
3086 vmovdqu64 16*0($inp),$T2 # load data
3087 vmovdqu64 16*2($inp),$T3
3090 vpunpcklqdq $T3,$T2,$T1 # transpose data
3091 vpunpckhqdq $T3,$T2,$T3
3093 # at this point 64-bit lanes are ordered as 3-1-2-0
3095 vpsrlq \$24,$T3,$T2 # splat the data
3097 vpaddq $T2,$H2,$H2 # accumulate input
3098 vpandq $mask44,$T1,$T0
3102 vpandq $mask44,$T1,$T1
3105 jz .Ltail_vpmadd52_4x
3106 jmp .Loop_vpmadd52_4x
3111 vmovq 24($ctx),%x#$S1 # load key
3112 vmovq 56($ctx),%x#$H2
3113 vmovq 32($ctx),%x#$S2
3114 vmovq 40($ctx),%x#$R0
3115 vmovq 48($ctx),%x#$R1
3123 .Lmul_init_vpmadd52:
3124 vpxorq $D0lo,$D0lo,$D0lo
3125 vpmadd52luq $H2,$S1,$D0lo
3126 vpxorq $D0hi,$D0hi,$D0hi
3127 vpmadd52huq $H2,$S1,$D0hi
3128 vpxorq $D1lo,$D1lo,$D1lo
3129 vpmadd52luq $H2,$S2,$D1lo
3130 vpxorq $D1hi,$D1hi,$D1hi
3131 vpmadd52huq $H2,$S2,$D1hi
3132 vpxorq $D2lo,$D2lo,$D2lo
3133 vpmadd52luq $H2,$R0,$D2lo
3134 vpxorq $D2hi,$D2hi,$D2hi
3135 vpmadd52huq $H2,$R0,$D2hi
3137 vpmadd52luq $H0,$R0,$D0lo
3138 vpmadd52huq $H0,$R0,$D0hi
3139 vpmadd52luq $H0,$R1,$D1lo
3140 vpmadd52huq $H0,$R1,$D1hi
3141 vpmadd52luq $H0,$R2,$D2lo
3142 vpmadd52huq $H0,$R2,$D2hi
3144 vpmadd52luq $H1,$S2,$D0lo
3145 vpmadd52huq $H1,$S2,$D0hi
3146 vpmadd52luq $H1,$R0,$D1lo
3147 vpmadd52huq $H1,$R0,$D1hi
3148 vpmadd52luq $H1,$R1,$D2lo
3149 vpmadd52huq $H1,$R1,$D2hi
3151 ################################################################
3153 vpsrlq \$44,$D0lo,$tmp
3154 vpsllq \$8,$D0hi,$D0hi
3155 vpandq $mask44,$D0lo,$H0
3156 vpaddq $tmp,$D0hi,$D0hi
3158 vpaddq $D0hi,$D1lo,$D1lo
3160 vpsrlq \$44,$D1lo,$tmp
3161 vpsllq \$8,$D1hi,$D1hi
3162 vpandq $mask44,$D1lo,$H1
3163 vpaddq $tmp,$D1hi,$D1hi
3165 vpaddq $D1hi,$D2lo,$D2lo
3167 vpsrlq \$42,$D2lo,$tmp
3168 vpsllq \$10,$D2hi,$D2hi
3169 vpandq $mask42,$D2lo,$H2
3170 vpaddq $tmp,$D2hi,$D2hi
3172 vpaddq $D2hi,$H0,$H0
3173 vpsllq \$2,$D2hi,$D2hi
3175 vpaddq $D2hi,$H0,$H0
3177 vpsrlq \$44,$H0,$tmp # additional step
3178 vpandq $mask44,$H0,$H0
3183 jz .Ldone_init_vpmadd52
3185 vpunpcklqdq $R1,$H1,$R1 # 1,2
3186 vpbroadcastq %x#$H1,%x#$H1 # 2,2
3187 vpunpcklqdq $R2,$H2,$R2
3188 vpbroadcastq %x#$H2,%x#$H2
3189 vpunpcklqdq $R0,$H0,$R0
3190 vpbroadcastq %x#$H0,%x#$H0
3192 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3193 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3199 jmp .Lmul_init_vpmadd52
3203 .Ldone_init_vpmadd52:
3204 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3205 vinserti128 \$1,%x#$R2,$H2,$R2
3206 vinserti128 \$1,%x#$R0,$H0,$R0
3208 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3209 vpermq \$0b11011000,$R2,$R2
3210 vpermq \$0b11011000,$R0,$R0
3212 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3216 vmovq 0($ctx),%x#$H0 # load current hash value
3217 vmovq 8($ctx),%x#$H1
3218 vmovq 16($ctx),%x#$H2
3220 test \$3,$len # is length 4*n+2?
3221 jnz .Ldone_init_vpmadd52_2x
3223 vmovdqu64 $R0,64($ctx) # save key powers
3224 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3225 vmovdqu64 $R1,96($ctx)
3226 vpbroadcastq %x#$R1,$R1
3227 vmovdqu64 $R2,128($ctx)
3228 vpbroadcastq %x#$R2,$R2
3229 vmovdqu64 $S1,160($ctx)
3230 vpbroadcastq %x#$S1,$S1
3232 jmp .Lblocks_vpmadd52_4x_key_loaded
3236 .Ldone_init_vpmadd52_2x:
3237 vmovdqu64 $R0,64($ctx) # save key powers
3238 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3239 vmovdqu64 $R1,96($ctx)
3241 vmovdqu64 $R2,128($ctx)
3243 vmovdqu64 $S1,160($ctx)
3245 jmp .Lblocks_vpmadd52_2x_key_loaded
3249 .Lblocks_vpmadd52_2x_do:
3250 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3251 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3252 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3253 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3255 .Lblocks_vpmadd52_2x_key_loaded:
3256 vmovdqu64 16*0($inp),$T2 # load data
3260 vpunpcklqdq $T3,$T2,$T1 # transpose data
3261 vpunpckhqdq $T3,$T2,$T3
3263 # at this point 64-bit lanes are ordered as x-1-x-0
3265 vpsrlq \$24,$T3,$T2 # splat the data
3267 vpaddq $T2,$H2,$H2 # accumulate input
3268 vpandq $mask44,$T1,$T0
3272 vpandq $mask44,$T1,$T1
3274 jmp .Ltail_vpmadd52_2x
3279 #vpaddq $T2,$H2,$H2 # accumulate input
3283 vpxorq $D0lo,$D0lo,$D0lo
3284 vpmadd52luq $H2,$S1,$D0lo
3285 vpxorq $D0hi,$D0hi,$D0hi
3286 vpmadd52huq $H2,$S1,$D0hi
3287 vpxorq $D1lo,$D1lo,$D1lo
3288 vpmadd52luq $H2,$S2,$D1lo
3289 vpxorq $D1hi,$D1hi,$D1hi
3290 vpmadd52huq $H2,$S2,$D1hi
3291 vpxorq $D2lo,$D2lo,$D2lo
3292 vpmadd52luq $H2,$R0,$D2lo
3293 vpxorq $D2hi,$D2hi,$D2hi
3294 vpmadd52huq $H2,$R0,$D2hi
3296 vmovdqu64 16*0($inp),$T2 # load data
3297 vmovdqu64 16*2($inp),$T3
3299 vpmadd52luq $H0,$R0,$D0lo
3300 vpmadd52huq $H0,$R0,$D0hi
3301 vpmadd52luq $H0,$R1,$D1lo
3302 vpmadd52huq $H0,$R1,$D1hi
3303 vpmadd52luq $H0,$R2,$D2lo
3304 vpmadd52huq $H0,$R2,$D2hi
3306 vpunpcklqdq $T3,$T2,$T1 # transpose data
3307 vpunpckhqdq $T3,$T2,$T3
3308 vpmadd52luq $H1,$S2,$D0lo
3309 vpmadd52huq $H1,$S2,$D0hi
3310 vpmadd52luq $H1,$R0,$D1lo
3311 vpmadd52huq $H1,$R0,$D1hi
3312 vpmadd52luq $H1,$R1,$D2lo
3313 vpmadd52huq $H1,$R1,$D2hi
3315 ################################################################
3316 # partial reduction (interleaved with data splat)
3317 vpsrlq \$44,$D0lo,$tmp
3318 vpsllq \$8,$D0hi,$D0hi
3319 vpandq $mask44,$D0lo,$H0
3320 vpaddq $tmp,$D0hi,$D0hi
3324 vpaddq $D0hi,$D1lo,$D1lo
3326 vpsrlq \$44,$D1lo,$tmp
3327 vpsllq \$8,$D1hi,$D1hi
3328 vpandq $mask44,$D1lo,$H1
3329 vpaddq $tmp,$D1hi,$D1hi
3331 vpandq $mask44,$T1,$T0
3334 vpaddq $D1hi,$D2lo,$D2lo
3336 vpsrlq \$42,$D2lo,$tmp
3337 vpsllq \$10,$D2hi,$D2hi
3338 vpandq $mask42,$D2lo,$H2
3339 vpaddq $tmp,$D2hi,$D2hi
3341 vpaddq $T2,$H2,$H2 # accumulate input
3342 vpaddq $D2hi,$H0,$H0
3343 vpsllq \$2,$D2hi,$D2hi
3345 vpaddq $D2hi,$H0,$H0
3347 vpandq $mask44,$T1,$T1
3349 vpsrlq \$44,$H0,$tmp # additional step
3350 vpandq $mask44,$H0,$H0
3354 sub \$4,$len # len-=64
3355 jnz .Loop_vpmadd52_4x
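# (Sketch, for reference only: the partial reduction in the loop above is
#  the scalar carry chain
#	d1 += d0>>44;  h0 = d0 & mask44
#	d2 += d1>>44;  h1 = d1 & mask44
#	c  = d2>>42;   h2 = d2 & mask42
#	h0 += c*5;     h1 += h0>>44;  h0 &= mask44
#  with c*5 realized as c + (c<<2), i.e. the vpaddq/vpsllq \$2/vpaddq trio
#  on $D2hi, and the hi halves pre-shifted left by 8 (or 10) to align the
#  2^52-scaled vpmadd52huq products with the 44-bit (42-bit) limb cuts.)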
3358 vmovdqu64 128($ctx),$R2 # load all key powers
3359 vmovdqu64 160($ctx),$S1
3360 vmovdqu64 64($ctx),$R0
3361 vmovdqu64 96($ctx),$R1
3364 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3368 #vpaddq $T2,$H2,$H2 # accumulate input
3372 vpxorq $D0lo,$D0lo,$D0lo
3373 vpmadd52luq $H2,$S1,$D0lo
3374 vpxorq $D0hi,$D0hi,$D0hi
3375 vpmadd52huq $H2,$S1,$D0hi
3376 vpxorq $D1lo,$D1lo,$D1lo
3377 vpmadd52luq $H2,$S2,$D1lo
3378 vpxorq $D1hi,$D1hi,$D1hi
3379 vpmadd52huq $H2,$S2,$D1hi
3380 vpxorq $D2lo,$D2lo,$D2lo
3381 vpmadd52luq $H2,$R0,$D2lo
3382 vpxorq $D2hi,$D2hi,$D2hi
3383 vpmadd52huq $H2,$R0,$D2hi
3385 vpmadd52luq $H0,$R0,$D0lo
3386 vpmadd52huq $H0,$R0,$D0hi
3387 vpmadd52luq $H0,$R1,$D1lo
3388 vpmadd52huq $H0,$R1,$D1hi
3389 vpmadd52luq $H0,$R2,$D2lo
3390 vpmadd52huq $H0,$R2,$D2hi
3392 vpmadd52luq $H1,$S2,$D0lo
3393 vpmadd52huq $H1,$S2,$D0hi
3394 vpmadd52luq $H1,$R0,$D1lo
3395 vpmadd52huq $H1,$R0,$D1hi
3396 vpmadd52luq $H1,$R1,$D2lo
3397 vpmadd52huq $H1,$R1,$D2hi
3399 ################################################################
3400 # horizontal addition
3404 vpsrldq \$8,$D0lo,$T0
3405 vpsrldq \$8,$D0hi,$H0
3406 vpsrldq \$8,$D1lo,$T1
3407 vpsrldq \$8,$D1hi,$H1
3408 vpaddq $T0,$D0lo,$D0lo
3409 vpaddq $H0,$D0hi,$D0hi
3410 vpsrldq \$8,$D2lo,$T2
3411 vpsrldq \$8,$D2hi,$H2
3412 vpaddq $T1,$D1lo,$D1lo
3413 vpaddq $H1,$D1hi,$D1hi
3414 vpermq \$0x2,$D0lo,$T0
3415 vpermq \$0x2,$D0hi,$H0
3416 vpaddq $T2,$D2lo,$D2lo
3417 vpaddq $H2,$D2hi,$D2hi
3419 vpermq \$0x2,$D1lo,$T1
3420 vpermq \$0x2,$D1hi,$H1
3421 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3422 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3423 vpermq \$0x2,$D2lo,$T2
3424 vpermq \$0x2,$D2hi,$H2
3425 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3426 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3427 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3428 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3430 ################################################################
3432 vpsrlq \$44,$D0lo,$tmp
3433 vpsllq \$8,$D0hi,$D0hi
3434 vpandq $mask44,$D0lo,$H0
3435 vpaddq $tmp,$D0hi,$D0hi
3437 vpaddq $D0hi,$D1lo,$D1lo
3439 vpsrlq \$44,$D1lo,$tmp
3440 vpsllq \$8,$D1hi,$D1hi
3441 vpandq $mask44,$D1lo,$H1
3442 vpaddq $tmp,$D1hi,$D1hi
3444 vpaddq $D1hi,$D2lo,$D2lo
3446 vpsrlq \$42,$D2lo,$tmp
3447 vpsllq \$10,$D2hi,$D2hi
3448 vpandq $mask42,$D2lo,$H2
3449 vpaddq $tmp,$D2hi,$D2hi
3451 vpaddq $D2hi,$H0,$H0
3452 vpsllq \$2,$D2hi,$D2hi
3454 vpaddq $D2hi,$H0,$H0
3456 vpsrlq \$44,$H0,$tmp # additional step
3457 vpandq $mask44,$H0,$H0
3460 # at this point $len is
3461 # either 4*n+2 or 0...
3462 sub \$2,$len # len-=32
3463 ja .Lblocks_vpmadd52_4x_do
3465 vmovq %x#$H0,0($ctx)
3466 vmovq %x#$H1,8($ctx)
3467 vmovq %x#$H2,16($ctx)
3470 .Lno_data_vpmadd52_4x:
3472 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3476 ########################################################################
3477 # As implied by its name, the 8x subroutine processes 8 blocks in parallel...
3478 # This is an intermediate version, as it's used only in cases when the input
3479 # length is either 8*n, 8*n+1 or 8*n+2...
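# A hedged sketch (unused) of how the extra powers are derived below:
# multiplying the stored 1-3-2-4 power vector lane-wise by r^4 gives
# r^(n+4) in every lane, i.e. the 5-7-6-8 vector, in one multiplication.
sub more_powers_sketch {
	use bigint;			# literals below are big integers
	my $r  = 0 + shift;		# promote the key to a big integer
	my $p  = 2**130 - 5;
	my @R  = map { $r**$_ % $p } (1, 3, 2, 4);	# as stored: 1-3-2-4
	my $r4 = $r**4 % $p;
	my @RR = map { $_ * $r4 % $p } @R;	# lane-wise *r^4 -> 5-7-6-8
	return (\@R, \@RR);
}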
3481 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3482 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3483 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3484 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3487 .type poly1305_blocks_vpmadd52_8x,\@function,4
3489 poly1305_blocks_vpmadd52_8x:
3491 jz .Lno_data_vpmadd52_8x # too short
3494 mov 64($ctx),%r8 # peek at the key power
3496 vmovdqa64 .Lx_mask44(%rip),$mask44
3497 vmovdqa64 .Lx_mask42(%rip),$mask42
3499 test %r8,%r8 # is power value impossible?
3500 js .Linit_vpmadd52 # if it is, then init R[4]
3502 vmovq 0($ctx),%x#$H0 # load current hash value
3503 vmovq 8($ctx),%x#$H1
3504 vmovq 16($ctx),%x#$H2
3506 .Lblocks_vpmadd52_8x:
3507 ################################################################
3508 # first we calculate more key powers
3510 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3511 vmovdqu64 160($ctx),$S1
3512 vmovdqu64 64($ctx),$R0
3513 vmovdqu64 96($ctx),$R1
3515 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3519 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3520 vpbroadcastq %x#$R0,$RR0
3521 vpbroadcastq %x#$R1,$RR1
3523 vpxorq $D0lo,$D0lo,$D0lo
3524 vpmadd52luq $RR2,$S1,$D0lo
3525 vpxorq $D0hi,$D0hi,$D0hi
3526 vpmadd52huq $RR2,$S1,$D0hi
3527 vpxorq $D1lo,$D1lo,$D1lo
3528 vpmadd52luq $RR2,$S2,$D1lo
3529 vpxorq $D1hi,$D1hi,$D1hi
3530 vpmadd52huq $RR2,$S2,$D1hi
3531 vpxorq $D2lo,$D2lo,$D2lo
3532 vpmadd52luq $RR2,$R0,$D2lo
3533 vpxorq $D2hi,$D2hi,$D2hi
3534 vpmadd52huq $RR2,$R0,$D2hi
3536 vpmadd52luq $RR0,$R0,$D0lo
3537 vpmadd52huq $RR0,$R0,$D0hi
3538 vpmadd52luq $RR0,$R1,$D1lo
3539 vpmadd52huq $RR0,$R1,$D1hi
3540 vpmadd52luq $RR0,$R2,$D2lo
3541 vpmadd52huq $RR0,$R2,$D2hi
3543 vpmadd52luq $RR1,$S2,$D0lo
3544 vpmadd52huq $RR1,$S2,$D0hi
3545 vpmadd52luq $RR1,$R0,$D1lo
3546 vpmadd52huq $RR1,$R0,$D1hi
3547 vpmadd52luq $RR1,$R1,$D2lo
3548 vpmadd52huq $RR1,$R1,$D2hi
3550 ################################################################
3552 vpsrlq \$44,$D0lo,$tmp
3553 vpsllq \$8,$D0hi,$D0hi
3554 vpandq $mask44,$D0lo,$RR0
3555 vpaddq $tmp,$D0hi,$D0hi
3557 vpaddq $D0hi,$D1lo,$D1lo
3559 vpsrlq \$44,$D1lo,$tmp
3560 vpsllq \$8,$D1hi,$D1hi
3561 vpandq $mask44,$D1lo,$RR1
3562 vpaddq $tmp,$D1hi,$D1hi
3564 vpaddq $D1hi,$D2lo,$D2lo
3566 vpsrlq \$42,$D2lo,$tmp
3567 vpsllq \$10,$D2hi,$D2hi
3568 vpandq $mask42,$D2lo,$RR2
3569 vpaddq $tmp,$D2hi,$D2hi
3571 vpaddq $D2hi,$RR0,$RR0
3572 vpsllq \$2,$D2hi,$D2hi
3574 vpaddq $D2hi,$RR0,$RR0
3576 vpsrlq \$44,$RR0,$tmp # additional step
3577 vpandq $mask44,$RR0,$RR0
3579 vpaddq $tmp,$RR1,$RR1
3581 ################################################################
3582 # At this point Rx holds the 1-3-2-4 powers and RRx the 5-7-6-8 ones; the
3583 # goal is the 1-5-2-6-3-7-4-8 order, which reflects how the data is loaded...
3585 vpunpcklqdq $R2,$RR2,$T2 # 3748
3586 vpunpckhqdq $R2,$RR2,$R2 # 1526
3587 vpunpcklqdq $R0,$RR0,$T0
3588 vpunpckhqdq $R0,$RR0,$R0
3589 vpunpcklqdq $R1,$RR1,$T1
3590 vpunpckhqdq $R1,$RR1,$R1
3592 ######## switch to %zmm
3593 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3594 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3595 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3596 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3599 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3600 vshufi64x2 \$0x44,$R0,$T0,$RR0
3601 vshufi64x2 \$0x44,$R1,$T1,$RR1
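# (For reference: the vpunpck{l,h}qdq pair above split the 1-3-2-4 and
#  5-7-6-8 vectors into the 3748 and 1526 halves noted in the comments,
#  and vshufi64x2 \$0x44 glues those back together so that each %zmm
#  lane now carries the power matching the block it will multiply in
#  the 8-way load order.)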
3603 vmovdqu64 16*0($inp),$T2 # load data
3604 vmovdqu64 16*4($inp),$T3
3607 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3608 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3609 vpaddq $RR2,$SS2,$SS2
3610 vpaddq $RR1,$SS1,$SS1
3611 vpsllq \$2,$SS2,$SS2
3612 vpsllq \$2,$SS1,$SS1
3614 vpbroadcastq $padbit,$PAD
3615 vpbroadcastq %x#$mask44,$mask44
3616 vpbroadcastq %x#$mask42,$mask42
3618 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3619 vpbroadcastq %x#$SS2,$S2
3620 vpbroadcastq %x#$RR0,$R0
3621 vpbroadcastq %x#$RR1,$R1
3622 vpbroadcastq %x#$RR2,$R2
3624 vpunpcklqdq $T3,$T2,$T1 # transpose data
3625 vpunpckhqdq $T3,$T2,$T3
3627 # at this point 64-bit lanes are ordered as 73625140
3629 vpsrlq \$24,$T3,$T2 # splat the data
3631 vpaddq $T2,$H2,$H2 # accumulate input
3632 vpandq $mask44,$T1,$T0
3636 vpandq $mask44,$T1,$T1
3639 jz .Ltail_vpmadd52_8x
3640 jmp .Loop_vpmadd52_8x
3644 #vpaddq $T2,$H2,$H2 # accumulate input
3648 vpxorq $D0lo,$D0lo,$D0lo
3649 vpmadd52luq $H2,$S1,$D0lo
3650 vpxorq $D0hi,$D0hi,$D0hi
3651 vpmadd52huq $H2,$S1,$D0hi
3652 vpxorq $D1lo,$D1lo,$D1lo
3653 vpmadd52luq $H2,$S2,$D1lo
3654 vpxorq $D1hi,$D1hi,$D1hi
3655 vpmadd52huq $H2,$S2,$D1hi
3656 vpxorq $D2lo,$D2lo,$D2lo
3657 vpmadd52luq $H2,$R0,$D2lo
3658 vpxorq $D2hi,$D2hi,$D2hi
3659 vpmadd52huq $H2,$R0,$D2hi
3661 vmovdqu64 16*0($inp),$T2 # load data
3662 vmovdqu64 16*4($inp),$T3
3664 vpmadd52luq $H0,$R0,$D0lo
3665 vpmadd52huq $H0,$R0,$D0hi
3666 vpmadd52luq $H0,$R1,$D1lo
3667 vpmadd52huq $H0,$R1,$D1hi
3668 vpmadd52luq $H0,$R2,$D2lo
3669 vpmadd52huq $H0,$R2,$D2hi
3671 vpunpcklqdq $T3,$T2,$T1 # transpose data
3672 vpunpckhqdq $T3,$T2,$T3
3673 vpmadd52luq $H1,$S2,$D0lo
3674 vpmadd52huq $H1,$S2,$D0hi
3675 vpmadd52luq $H1,$R0,$D1lo
3676 vpmadd52huq $H1,$R0,$D1hi
3677 vpmadd52luq $H1,$R1,$D2lo
3678 vpmadd52huq $H1,$R1,$D2hi
3680 ################################################################
3681 # partial reduction (interleaved with data splat)
3682 vpsrlq \$44,$D0lo,$tmp
3683 vpsllq \$8,$D0hi,$D0hi
3684 vpandq $mask44,$D0lo,$H0
3685 vpaddq $tmp,$D0hi,$D0hi
3689 vpaddq $D0hi,$D1lo,$D1lo
3691 vpsrlq \$44,$D1lo,$tmp
3692 vpsllq \$8,$D1hi,$D1hi
3693 vpandq $mask44,$D1lo,$H1
3694 vpaddq $tmp,$D1hi,$D1hi
3696 vpandq $mask44,$T1,$T0
3699 vpaddq $D1hi,$D2lo,$D2lo
3701 vpsrlq \$42,$D2lo,$tmp
3702 vpsllq \$10,$D2hi,$D2hi
3703 vpandq $mask42,$D2lo,$H2
3704 vpaddq $tmp,$D2hi,$D2hi
3706 vpaddq $T2,$H2,$H2 # accumulate input
3707 vpaddq $D2hi,$H0,$H0
3708 vpsllq \$2,$D2hi,$D2hi
3710 vpaddq $D2hi,$H0,$H0
3712 vpandq $mask44,$T1,$T1
3714 vpsrlq \$44,$H0,$tmp # additional step
3715 vpandq $mask44,$H0,$H0
3719 sub \$8,$len # len-=128
3720 jnz .Loop_vpmadd52_8x
3723 #vpaddq $T2,$H2,$H2 # accumulate input
3727 vpxorq $D0lo,$D0lo,$D0lo
3728 vpmadd52luq $H2,$SS1,$D0lo
3729 vpxorq $D0hi,$D0hi,$D0hi
3730 vpmadd52huq $H2,$SS1,$D0hi
3731 vpxorq $D1lo,$D1lo,$D1lo
3732 vpmadd52luq $H2,$SS2,$D1lo
3733 vpxorq $D1hi,$D1hi,$D1hi
3734 vpmadd52huq $H2,$SS2,$D1hi
3735 vpxorq $D2lo,$D2lo,$D2lo
3736 vpmadd52luq $H2,$RR0,$D2lo
3737 vpxorq $D2hi,$D2hi,$D2hi
3738 vpmadd52huq $H2,$RR0,$D2hi
3740 vpmadd52luq $H0,$RR0,$D0lo
3741 vpmadd52huq $H0,$RR0,$D0hi
3742 vpmadd52luq $H0,$RR1,$D1lo
3743 vpmadd52huq $H0,$RR1,$D1hi
3744 vpmadd52luq $H0,$RR2,$D2lo
3745 vpmadd52huq $H0,$RR2,$D2hi
3747 vpmadd52luq $H1,$SS2,$D0lo
3748 vpmadd52huq $H1,$SS2,$D0hi
3749 vpmadd52luq $H1,$RR0,$D1lo
3750 vpmadd52huq $H1,$RR0,$D1hi
3751 vpmadd52luq $H1,$RR1,$D2lo
3752 vpmadd52huq $H1,$RR1,$D2hi
3754 ################################################################
3755 # horizontal addition
3759 vpsrldq \$8,$D0lo,$T0
3760 vpsrldq \$8,$D0hi,$H0
3761 vpsrldq \$8,$D1lo,$T1
3762 vpsrldq \$8,$D1hi,$H1
3763 vpaddq $T0,$D0lo,$D0lo
3764 vpaddq $H0,$D0hi,$D0hi
3765 vpsrldq \$8,$D2lo,$T2
3766 vpsrldq \$8,$D2hi,$H2
3767 vpaddq $T1,$D1lo,$D1lo
3768 vpaddq $H1,$D1hi,$D1hi
3769 vpermq \$0x2,$D0lo,$T0
3770 vpermq \$0x2,$D0hi,$H0
3771 vpaddq $T2,$D2lo,$D2lo
3772 vpaddq $H2,$D2hi,$D2hi
3774 vpermq \$0x2,$D1lo,$T1
3775 vpermq \$0x2,$D1hi,$H1
3776 vpaddq $T0,$D0lo,$D0lo
3777 vpaddq $H0,$D0hi,$D0hi
3778 vpermq \$0x2,$D2lo,$T2
3779 vpermq \$0x2,$D2hi,$H2
3780 vpaddq $T1,$D1lo,$D1lo
3781 vpaddq $H1,$D1hi,$D1hi
3782 vextracti64x4 \$1,$D0lo,%y#$T0
3783 vextracti64x4 \$1,$D0hi,%y#$H0
3784 vpaddq $T2,$D2lo,$D2lo
3785 vpaddq $H2,$D2hi,$D2hi
3787 vextracti64x4 \$1,$D1lo,%y#$T1
3788 vextracti64x4 \$1,$D1hi,%y#$H1
3789 vextracti64x4 \$1,$D2lo,%y#$T2
3790 vextracti64x4 \$1,$D2hi,%y#$H2
3792 ######## switch back to %ymm
3793 map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3794 map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3795 map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3798 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3799 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3800 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3801 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3802 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3803 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3805 ################################################################
3807 vpsrlq \$44,$D0lo,$tmp
3808 vpsllq \$8,$D0hi,$D0hi
3809 vpandq $mask44,$D0lo,$H0
3810 vpaddq $tmp,$D0hi,$D0hi
3812 vpaddq $D0hi,$D1lo,$D1lo
3814 vpsrlq \$44,$D1lo,$tmp
3815 vpsllq \$8,$D1hi,$D1hi
3816 vpandq $mask44,$D1lo,$H1
3817 vpaddq $tmp,$D1hi,$D1hi
3819 vpaddq $D1hi,$D2lo,$D2lo
3821 vpsrlq \$42,$D2lo,$tmp
3822 vpsllq \$10,$D2hi,$D2hi
3823 vpandq $mask42,$D2lo,$H2
3824 vpaddq $tmp,$D2hi,$D2hi
3826 vpaddq $D2hi,$H0,$H0
3827 vpsllq \$2,$D2hi,$D2hi
3829 vpaddq $D2hi,$H0,$H0
3831 vpsrlq \$44,$H0,$tmp # additional step
3832 vpandq $mask44,$H0,$H0
3836 ################################################################
3838 vmovq %x#$H0,0($ctx)
3839 vmovq %x#$H1,8($ctx)
3840 vmovq %x#$H2,16($ctx)
3843 .Lno_data_vpmadd52_8x:
3845 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3849 .type poly1305_emit_base2_44,\@function,3
3851 poly1305_emit_base2_44:
3852 mov 0($ctx),%r8 # load hash value
3868 add \$5,%r8 # compare to modulus
3872 shr \$2,%r10 # did 130-bit value overflow?
3876 add 0($nonce),%rax # accumulate nonce
3878 mov %rax,0($mac) # write result
3882 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3888 { # chacha20-poly1305 helpers
3889 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3890 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
3892 .globl xor128_encrypt_n_pad
3893 .type xor128_encrypt_n_pad,\@abi-omnipotent
3895 xor128_encrypt_n_pad:
3898 mov $len,%r10 # put len aside
3899 shr \$4,$len # len / 16
3903 movdqu ($inp,$otp),%xmm0
3905 movdqu %xmm0,($out,$otp)
3911 and \$15,%r10 # len % 16
3937 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3939 .globl xor128_decrypt_n_pad
3940 .type xor128_decrypt_n_pad,\@abi-omnipotent
3942 xor128_decrypt_n_pad:
3945 mov $len,%r10 # put len aside
3946 shr \$4,$len # len / 16
3950 movdqu ($inp,$otp),%xmm0
3953 movdqu %xmm1,($out,$otp)
3960 and \$15,%r10 # len % 16
3969 mov ($inp,$otp),%r11b
3988 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3992 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3993 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4001 .extern __imp_RtlVirtualUnwind
4002 .type se_handler,\@abi-omnipotent
4016 mov 120($context),%rax # pull context->Rax
4017 mov 248($context),%rbx # pull context->Rip
4019 mov 8($disp),%rsi # disp->ImageBase
4020 mov 56($disp),%r11 # disp->HandlerData
4022 mov 0(%r11),%r10d # HandlerData[0]
4023 lea (%rsi,%r10),%r10 # prologue label
4024 cmp %r10,%rbx # context->Rip<.Lprologue
4025 jb .Lcommon_seh_tail
4027 mov 152($context),%rax # pull context->Rsp
4029 mov 4(%r11),%r10d # HandlerData[1]
4030 lea (%rsi,%r10),%r10 # epilogue label
4031 cmp %r10,%rbx # context->Rip>=.Lepilogue
4032 jae .Lcommon_seh_tail
4042 mov %rbx,144($context) # restore context->Rbx
4043 mov %rbp,160($context) # restore context->Rbp
4044 mov %r12,216($context) # restore context->R12
4045 mov %r13,224($context) # restore context->R13
4046 mov %r14,232($context) # restore context->R14
4047 mov %r15,240($context) # restore context->R15
4049 jmp .Lcommon_seh_tail
4050 .size se_handler,.-se_handler
4052 .type avx_handler,\@abi-omnipotent
4066 mov 120($context),%rax # pull context->Rax
4067 mov 248($context),%rbx # pull context->Rip
4069 mov 8($disp),%rsi # disp->ImageBase
4070 mov 56($disp),%r11 # disp->HandlerData
4072 mov 0(%r11),%r10d # HandlerData[0]
4073 lea (%rsi,%r10),%r10 # prologue label
4074 cmp %r10,%rbx # context->Rip<prologue label
4075 jb .Lcommon_seh_tail
4077 mov 152($context),%rax # pull context->Rsp
4079 mov 4(%r11),%r10d # HandlerData[1]
4080 lea (%rsi,%r10),%r10 # epilogue label
4081 cmp %r10,%rbx # context->Rip>=epilogue label
4082 jae .Lcommon_seh_tail
4084 mov 208($context),%rax # pull context->R11
4088 lea 512($context),%rdi # &context.Xmm6
4090 .long 0xa548f3fc # cld; rep movsq
4095 mov %rax,152($context) # restore context->Rsp
4096 mov %rsi,168($context) # restore context->Rsi
4097 mov %rdi,176($context) # restore context->Rdi
4099 mov 40($disp),%rdi # disp->ContextRecord
4100 mov $context,%rsi # context
4101 mov \$154,%ecx # sizeof(CONTEXT) in qwords
4102 .long 0xa548f3fc # cld; rep movsq
4105 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4106 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4107 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4108 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4109 mov 40(%rsi),%r10 # disp->ContextRecord
4110 lea 56(%rsi),%r11 # &disp->HandlerData
4111 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4112 mov %r10,32(%rsp) # arg5
4113 mov %r11,40(%rsp) # arg6
4114 mov %r12,48(%rsp) # arg7
4115 mov %rcx,56(%rsp) # arg8, (NULL)
4116 call *__imp_RtlVirtualUnwind(%rip)
4118 mov \$1,%eax # ExceptionContinueSearch
4130 .size avx_handler,.-avx_handler
4134 .rva .LSEH_begin_poly1305_init_x86_64
4135 .rva .LSEH_end_poly1305_init_x86_64
4136 .rva .LSEH_info_poly1305_init_x86_64
4138 .rva .LSEH_begin_poly1305_blocks_x86_64
4139 .rva .LSEH_end_poly1305_blocks_x86_64
4140 .rva .LSEH_info_poly1305_blocks_x86_64
4142 .rva .LSEH_begin_poly1305_emit_x86_64
4143 .rva .LSEH_end_poly1305_emit_x86_64
4144 .rva .LSEH_info_poly1305_emit_x86_64
4146 $code.=<<___ if ($avx);
4147 .rva .LSEH_begin_poly1305_blocks_avx
4149 .rva .LSEH_info_poly1305_blocks_avx_1
4153 .rva .LSEH_info_poly1305_blocks_avx_2
4156 .rva .LSEH_end_poly1305_blocks_avx
4157 .rva .LSEH_info_poly1305_blocks_avx_3
4159 .rva .LSEH_begin_poly1305_emit_avx
4160 .rva .LSEH_end_poly1305_emit_avx
4161 .rva .LSEH_info_poly1305_emit_avx
4163 $code.=<<___ if ($avx>1);
4164 .rva .LSEH_begin_poly1305_blocks_avx2
4165 .rva .Lbase2_64_avx2
4166 .rva .LSEH_info_poly1305_blocks_avx2_1
4168 .rva .Lbase2_64_avx2
4170 .rva .LSEH_info_poly1305_blocks_avx2_2
4173 .rva .LSEH_end_poly1305_blocks_avx2
4174 .rva .LSEH_info_poly1305_blocks_avx2_3
4176 $code.=<<___ if ($avx>2);
4177 .rva .LSEH_begin_poly1305_blocks_avx512
4178 .rva .LSEH_end_poly1305_blocks_avx512
4179 .rva .LSEH_info_poly1305_blocks_avx512
4184 .LSEH_info_poly1305_init_x86_64:
4187 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
4189 .LSEH_info_poly1305_blocks_x86_64:
4192 .rva .Lblocks_body,.Lblocks_epilogue
4194 .LSEH_info_poly1305_emit_x86_64:
4197 .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
4199 $code.=<<___ if ($avx);
4200 .LSEH_info_poly1305_blocks_avx_1:
4203 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4205 .LSEH_info_poly1305_blocks_avx_2:
4208 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4210 .LSEH_info_poly1305_blocks_avx_3:
4213 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4215 .LSEH_info_poly1305_emit_avx:
4218 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4220 $code.=<<___ if ($avx>1);
4221 .LSEH_info_poly1305_blocks_avx2_1:
4224 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4226 .LSEH_info_poly1305_blocks_avx2_2:
4229 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4231 .LSEH_info_poly1305_blocks_avx2_3:
4234 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4236 $code.=<<___ if ($avx>2);
4237 .LSEH_info_poly1305_blocks_avx512:
4240 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
4247 last if (!s/^#/\/\// and !/^$/);
4252 foreach (split('\n',$code)) {
4253 s/\`([^\`]*)\`/eval($1)/ge;
4254 s/%r([a-z]+)#d/%e$1/g;
4255 s/%r([0-9]+)#d/%r$1d/g;
4256 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4259 s/(^\.type.*),[0-9]+$/$1/;
4260 s/(^\.type.*),\@abi-omnipotent+$/$1,\@function/;
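# e.g. "%rax#d" becomes "%eax", "%r8#d" becomes "%r8d", and "%x#%ymm0"
# collapses to "%xmm0"; the .type edits drop the argument count and
# rewrite @abi-omnipotent as @function for the kernel's assembler.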