2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Author: Emilia Käsper and Peter Schwabe ###
15 ### Date: 2009-03-19 ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
24 # Started as transliteration to "perlasm" the original code has
25 # undergone following changes:
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - above was possible thanks to mixcolumns() modification that
31 # allowed to feed its output back to aesenc[last], this was
32 # achieved at cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement key setup subroutine, instead it
35 # relies on conversion of "conventional" key schedule as returned
36 # by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which allowed
38 # to skip one shiftrows(), reduce bit-sliced key schedule and
39 # speed-up conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
42 # Resulting performance in CPU cycles spent to encrypt one byte out
43 # of 4096-byte buffer with 128-bit key is:
45 # Emilia's this(*) difference
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
53 # (*) Comparison is not completely fair, because "this" is ECB,
54 # i.e. no extra processing such as counter values calculation
55 # and xor-ing input as in Emilia's CTR implementation is
56 # performed. However, the CTR calculations account for no more
57 # than 1% of total time, so comparison is *rather* fair.
59 # (**) Results were collected on Westmere, which is considered to
60 # be equivalent to Nehalem for this code.
62 # As for key schedule conversion subroutine. Interface to OpenSSL
63 # relies on per-invocation on-the-fly conversion. This naturally
64 # has an impact on performance, especially for short inputs. Conversion
65 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
68 # conversion conversion/8x block
73 # The ratio values mean that 128-byte blocks will be processed
74 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75 # etc. Then keep in mind that input sizes not divisible by 128 are
76 # *effectively* slower, especially shortest ones, e.g. consecutive
77 # 144-byte blocks are processed 44% slower than one would expect,
78 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79 # it's still faster than ["hyper-threading-safe" code path in]
80 # aes-x86_64.pl on all lengths above 64 bytes...
84 # Add decryption procedure. Performance in CPU cycles spent to decrypt
85 # one byte out of 4096-byte buffer with 128-bit key is:
95 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96 # suboptimal, but XTS is meant to be used with larger blocks...
102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109 die "can't locate x86_64-xlate.pl";
111 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
114 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
115 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
116 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
119 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
128 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
129 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
133 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
134 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
156 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
157 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
177 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
178 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
182 &InvInBasisChange (@b);
183 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
184 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
187 sub InvInBasisChange { # OutBasisChange in reverse
188 my @b=@_[5,1,2,6,3,7,0,4];
206 sub InvOutBasisChange { # InBasisChange in reverse
207 my @b=@_[2,5,7,3,6,1,0,4];
228 #;*************************************************************
229 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
230 #;*************************************************************
231 my ($x0,$x1,$y0,$y1,$t0)=@_;
244 sub Mul_GF4_N { # not used, see next subroutine
245 # multiply and scale by N
246 my ($x0,$x1,$y0,$y1,$t0)=@_;
260 # interleaved Mul_GF4_N and Mul_GF4
261 my ($x0,$x1,$y0,$y1,$t0,
262 $x2,$x3,$y2,$y3,$t1)=@_;
290 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
297 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
298 @x[2], @x[3], @y[2], @y[3], @t[2]);
310 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
311 @x[6], @x[7], @y[2], @y[3], @t[2]);
316 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
325 #;********************************************************************
326 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
327 #;********************************************************************
331 # direct optimizations from hardware
386 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
388 # new smaller inversion
422 # output in s3, s2, s1, t1
424 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
426 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
427 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
429 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
432 # AES linear components
438 pxor 0x00($key),@x[0]
439 pxor 0x10($key),@x[1]
440 pxor 0x20($key),@x[2]
441 pxor 0x30($key),@x[3]
444 pxor 0x40($key),@x[4]
445 pxor 0x50($key),@x[5]
448 pxor 0x60($key),@x[6]
449 pxor 0x70($key),@x[7]
459 # modified to emit output in order suitable for feeding back to aesenc[last]
462 my $inv=@_[16]; # optional
464 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
465 pshufd \$0x93, @x[1], @t[1]
466 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
467 pshufd \$0x93, @x[2], @t[2]
469 pshufd \$0x93, @x[3], @t[3]
471 pshufd \$0x93, @x[4], @t[4]
473 pshufd \$0x93, @x[5], @t[5]
475 pshufd \$0x93, @x[6], @t[6]
477 pshufd \$0x93, @x[7], @t[7]
484 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
486 pshufd \$0x4E, @x[1], @x[1]
492 pshufd \$0x4E, @x[4], @t[0]
494 pshufd \$0x4E, @x[5], @t[1]
496 pshufd \$0x4E, @x[3], @x[4]
498 pshufd \$0x4E, @x[7], @x[5]
500 pshufd \$0x4E, @x[6], @x[3]
502 pshufd \$0x4E, @x[2], @x[6]
505 $code.=<<___ if (!$inv);
513 $code.=<<___ if ($inv);
526 sub InvMixColumns_orig {
531 # multiplication by 0x0e
532 pshufd \$0x93, @x[7], @t[7]
534 pxor @x[5], @x[7] # 7 5
535 pxor @x[5], @x[2] # 2 5
536 pshufd \$0x93, @x[0], @t[0]
538 pxor @x[0], @x[5] # 5 0 [1]
539 pxor @x[1], @x[0] # 0 1
540 pshufd \$0x93, @x[1], @t[1]
541 pxor @x[2], @x[1] # 1 25
542 pxor @x[6], @x[0] # 01 6 [2]
543 pxor @x[3], @x[1] # 125 3 [4]
544 pshufd \$0x93, @x[3], @t[3]
545 pxor @x[0], @x[2] # 25 016 [3]
546 pxor @x[7], @x[3] # 3 75
547 pxor @x[6], @x[7] # 75 6 [0]
548 pshufd \$0x93, @x[6], @t[6]
550 pxor @x[4], @x[6] # 6 4
551 pxor @x[3], @x[4] # 4 375 [6]
552 pxor @x[7], @x[3] # 375 756=36
553 pxor @t[5], @x[6] # 64 5 [7]
554 pxor @t[2], @x[3] # 36 2
555 pxor @t[4], @x[3] # 362 4 [5]
556 pshufd \$0x93, @t[5], @t[5]
558 my @y = @x[7,5,0,2,1,3,4,6];
560 # multiplication by 0x0b
564 pshufd \$0x93, @t[2], @t[2]
568 pshufd \$0x93, @t[4], @t[4]
569 pxor @t[6], @t[7] # clobber t[7]
573 pshufd \$0x93, @t[0], @t[0]
577 pshufd \$0x93, @t[1], @t[1]
581 pshufd \$0x93, @t[2], @t[2]
585 pshufd \$0x93, @t[3], @t[3]
591 pxor @t[5], @t[7] # clobber t[7] even more
594 pshufd \$0x93, @t[4], @t[4]
599 pshufd \$0x93, @t[5], @t[5]
600 pxor @t[6], @t[7] # restore t[7]
602 # multiplication by 0x0d
605 pshufd \$0x93, @t[6], @t[6]
609 pshufd \$0x93, @t[7], @t[7]
618 pshufd \$0x93, @t[0], @t[0]
622 pshufd \$0x93, @t[1], @t[1]
627 pshufd \$0x93, @t[2], @t[2]
629 pxor @t[3], @t[6] # clobber t[6]
636 pshufd \$0x93, @t[4], @t[4]
639 pxor @t[3], @t[6] # restore t[6]
641 pshufd \$0x93, @t[5], @t[5]
642 pshufd \$0x93, @t[6], @t[6]
643 pshufd \$0x93, @t[7], @t[7]
644 pshufd \$0x93, @t[3], @t[3]
646 # multiplication by 0x09
648 pxor @y[1], @t[1] # t[1]=y[1]
649 pxor @t[5], @t[0] # clobber t[0]
652 pxor @y[0], @t[0] # t[0]=y[0]
654 pxor @t[7], @t[6] # clobber t[6]
657 pxor @y[4], @t[4] # t[4]=y[4]
659 pxor @y[3], @t[3] # t[3]=y[3]
661 pxor @y[2], @t[2] # t[2]=y[2]
663 pxor @y[5], @t[5] # t[5]=y[5]
666 pxor @y[6], @t[6] # t[6]=y[6]
667 pxor @y[7], @t[7] # t[7]=y[7]
684 # Thanks to Jussi Kivilinna for providing pointer to
686 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
687 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
688 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
689 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
692 # multiplication by 0x05-0x00-0x04-0x00
693 pshufd \$0x4E, @x[0], @t[0]
694 pshufd \$0x4E, @x[6], @t[6]
696 pshufd \$0x4E, @x[7], @t[7]
698 pshufd \$0x4E, @x[1], @t[1]
700 pshufd \$0x4E, @x[2], @t[2]
702 pshufd \$0x4E, @x[3], @t[3]
706 pshufd \$0x4E, @x[4], @t[4]
710 pshufd \$0x4E, @x[5], @t[5]
725 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
728 sub aesenc { # not used
732 movdqa 0x30($const),@t[0] # .LSR
734 &ShiftRows (@b,@t[0]);
736 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
739 sub aesenclast { # not used
743 movdqa 0x40($const),@t[0] # .LSRM0
745 &ShiftRows (@b,@t[0]);
748 pxor 0x00($key),@b[0]
749 pxor 0x10($key),@b[1]
750 pxor 0x20($key),@b[4]
751 pxor 0x30($key),@b[6]
752 pxor 0x40($key),@b[3]
753 pxor 0x50($key),@b[7]
754 pxor 0x60($key),@b[2]
755 pxor 0x70($key),@b[5]
760 my ($a,$b,$n,$mask,$t)=@_;
772 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
792 my @x=reverse(@_[0..7]);
793 my ($t0,$t1,$t2,$t3)=@_[8..11];
795 movdqa 0x00($const),$t0 # .LBS0
796 movdqa 0x10($const),$t1 # .LBS1
798 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
799 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
801 movdqa 0x20($const),$t0 # .LBS2
803 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
804 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
806 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
807 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
813 .extern asm_AES_encrypt
814 .extern asm_AES_decrypt
816 .type _bsaes_encrypt8,\@abi-omnipotent
819 lea .LBS0(%rip), $const # constants table
821 movdqa ($key), @XMM[9] # round 0 key
823 movdqa 0x50($const), @XMM[8] # .LM0SR
824 pxor @XMM[9], @XMM[0] # xor with round0 key
825 pxor @XMM[9], @XMM[1]
826 pxor @XMM[9], @XMM[2]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[0]
829 pshufb @XMM[8], @XMM[1]
830 pxor @XMM[9], @XMM[4]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[2]
833 pshufb @XMM[8], @XMM[3]
834 pxor @XMM[9], @XMM[6]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[4]
837 pshufb @XMM[8], @XMM[5]
838 pshufb @XMM[8], @XMM[6]
839 pshufb @XMM[8], @XMM[7]
840 _bsaes_encrypt8_bitslice:
842 &bitslice (@XMM[0..7, 8..11]);
849 &ShiftRows (@XMM[0..7, 8]);
850 $code.=".Lenc_sbox:\n";
851 &Sbox (@XMM[0..7, 8..15]);
856 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
858 movdqa 0x30($const), @XMM[8] # .LSR
860 movdqa 0x40($const), @XMM[8] # .LSRM0
865 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
866 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
868 movdqa ($key), @XMM[8] # last round key
869 pxor @XMM[8], @XMM[4]
870 pxor @XMM[8], @XMM[6]
871 pxor @XMM[8], @XMM[3]
872 pxor @XMM[8], @XMM[7]
873 pxor @XMM[8], @XMM[2]
874 pxor @XMM[8], @XMM[5]
875 pxor @XMM[8], @XMM[0]
876 pxor @XMM[8], @XMM[1]
878 .size _bsaes_encrypt8,.-_bsaes_encrypt8
880 .type _bsaes_decrypt8,\@abi-omnipotent
883 lea .LBS0(%rip), $const # constants table
885 movdqa ($key), @XMM[9] # round 0 key
887 movdqa -0x30($const), @XMM[8] # .LM0ISR
888 pxor @XMM[9], @XMM[0] # xor with round0 key
889 pxor @XMM[9], @XMM[1]
890 pxor @XMM[9], @XMM[2]
891 pxor @XMM[9], @XMM[3]
892 pshufb @XMM[8], @XMM[0]
893 pshufb @XMM[8], @XMM[1]
894 pxor @XMM[9], @XMM[4]
895 pxor @XMM[9], @XMM[5]
896 pshufb @XMM[8], @XMM[2]
897 pshufb @XMM[8], @XMM[3]
898 pxor @XMM[9], @XMM[6]
899 pxor @XMM[9], @XMM[7]
900 pshufb @XMM[8], @XMM[4]
901 pshufb @XMM[8], @XMM[5]
902 pshufb @XMM[8], @XMM[6]
903 pshufb @XMM[8], @XMM[7]
905 &bitslice (@XMM[0..7, 8..11]);
912 &ShiftRows (@XMM[0..7, 8]);
913 $code.=".Ldec_sbox:\n";
914 &InvSbox (@XMM[0..7, 8..15]);
919 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
921 movdqa -0x10($const), @XMM[8] # .LISR
923 movdqa -0x20($const), @XMM[8] # .LISRM0
928 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
930 movdqa ($key), @XMM[8] # last round key
931 pxor @XMM[8], @XMM[6]
932 pxor @XMM[8], @XMM[4]
933 pxor @XMM[8], @XMM[2]
934 pxor @XMM[8], @XMM[7]
935 pxor @XMM[8], @XMM[3]
936 pxor @XMM[8], @XMM[5]
937 pxor @XMM[8], @XMM[0]
938 pxor @XMM[8], @XMM[1]
940 .size _bsaes_decrypt8,.-_bsaes_decrypt8
944 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
947 my @x=reverse(@_[0..7]);
948 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
950 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
952 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
956 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
958 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
960 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
966 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
967 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
971 .type _bsaes_key_convert,\@abi-omnipotent
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
991 pshufb %xmm4, %xmm6 # .LM0
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1039 pxor %xmm5, %xmm13 # "pnot"
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1052 .size _bsaes_key_convert,.-_bsaes_key_convert
1056 if (0 && !$win64) { # following four functions are unsupported interface
1057 # used for benchmarking...
1059 .globl bsaes_enc_key_convert
1060 .type bsaes_enc_key_convert,\@function,2
1062 bsaes_enc_key_convert:
1063 mov 240($inp),%r10d # pass rounds
1064 mov $inp,%rcx # pass key
1065 mov $out,%rax # pass key schedule
1066 call _bsaes_key_convert
1067 pxor %xmm6,%xmm7 # fix up last round key
1068 movdqa %xmm7,(%rax) # save last round key
1070 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072 .globl bsaes_encrypt_128
1073 .type bsaes_encrypt_128,\@function,4
1077 movdqu 0x00($inp), @XMM[0] # load input
1078 movdqu 0x10($inp), @XMM[1]
1079 movdqu 0x20($inp), @XMM[2]
1080 movdqu 0x30($inp), @XMM[3]
1081 movdqu 0x40($inp), @XMM[4]
1082 movdqu 0x50($inp), @XMM[5]
1083 movdqu 0x60($inp), @XMM[6]
1084 movdqu 0x70($inp), @XMM[7]
1085 mov $key, %rax # pass the $key
1086 lea 0x80($inp), $inp
1089 call _bsaes_encrypt8
1091 movdqu @XMM[0], 0x00($out) # write output
1092 movdqu @XMM[1], 0x10($out)
1093 movdqu @XMM[4], 0x20($out)
1094 movdqu @XMM[6], 0x30($out)
1095 movdqu @XMM[3], 0x40($out)
1096 movdqu @XMM[7], 0x50($out)
1097 movdqu @XMM[2], 0x60($out)
1098 movdqu @XMM[5], 0x70($out)
1099 lea 0x80($out), $out
1103 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1105 .globl bsaes_dec_key_convert
1106 .type bsaes_dec_key_convert,\@function,2
1108 bsaes_dec_key_convert:
1109 mov 240($inp),%r10d # pass rounds
1110 mov $inp,%rcx # pass key
1111 mov $out,%rax # pass key schedule
1112 call _bsaes_key_convert
1113 pxor ($out),%xmm7 # fix up round 0 key
1114 movdqa %xmm6,(%rax) # save last round key
1117 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1119 .globl bsaes_decrypt_128
1120 .type bsaes_decrypt_128,\@function,4
1124 movdqu 0x00($inp), @XMM[0] # load input
1125 movdqu 0x10($inp), @XMM[1]
1126 movdqu 0x20($inp), @XMM[2]
1127 movdqu 0x30($inp), @XMM[3]
1128 movdqu 0x40($inp), @XMM[4]
1129 movdqu 0x50($inp), @XMM[5]
1130 movdqu 0x60($inp), @XMM[6]
1131 movdqu 0x70($inp), @XMM[7]
1132 mov $key, %rax # pass the $key
1133 lea 0x80($inp), $inp
1136 call _bsaes_decrypt8
1138 movdqu @XMM[0], 0x00($out) # write output
1139 movdqu @XMM[1], 0x10($out)
1140 movdqu @XMM[6], 0x20($out)
1141 movdqu @XMM[4], 0x30($out)
1142 movdqu @XMM[2], 0x40($out)
1143 movdqu @XMM[7], 0x50($out)
1144 movdqu @XMM[3], 0x60($out)
1145 movdqu @XMM[5], 0x70($out)
1146 lea 0x80($out), $out
1150 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1154 ######################################################################
1158 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1159 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1160 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1164 .globl bsaes_ecb_encrypt_blocks
1165 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1167 bsaes_ecb_encrypt_blocks:
1183 lea -0x48(%rsp),%rsp
1184 .cfi_adjust_cfa_offset 0x48
1186 $code.=<<___ if ($win64);
1187 lea -0xa0(%rsp), %rsp
1188 movaps %xmm6, 0x40(%rsp)
1189 movaps %xmm7, 0x50(%rsp)
1190 movaps %xmm8, 0x60(%rsp)
1191 movaps %xmm9, 0x70(%rsp)
1192 movaps %xmm10, 0x80(%rsp)
1193 movaps %xmm11, 0x90(%rsp)
1194 movaps %xmm12, 0xa0(%rsp)
1195 movaps %xmm13, 0xb0(%rsp)
1196 movaps %xmm14, 0xc0(%rsp)
1197 movaps %xmm15, 0xd0(%rsp)
1201 mov %rsp,%rbp # backup %rsp
1202 .cfi_def_cfa_register %rbp
1203 mov 240($arg4),%eax # rounds
1204 mov $arg1,$inp # backup arguments
1211 mov %eax,%ebx # backup rounds
1212 shl \$7,%rax # 128 bytes per inner round key
1213 sub \$`128-32`,%rax # size of bit-sliced key schedule
1215 mov %rsp,%rax # pass key schedule
1216 mov $key,%rcx # pass key
1217 mov %ebx,%r10d # pass rounds
1218 call _bsaes_key_convert
1219 pxor %xmm6,%xmm7 # fix up last round key
1220 movdqa %xmm7,(%rax) # save last round key
1224 movdqu 0x00($inp), @XMM[0] # load input
1225 movdqu 0x10($inp), @XMM[1]
1226 movdqu 0x20($inp), @XMM[2]
1227 movdqu 0x30($inp), @XMM[3]
1228 movdqu 0x40($inp), @XMM[4]
1229 movdqu 0x50($inp), @XMM[5]
1230 mov %rsp, %rax # pass key schedule
1231 movdqu 0x60($inp), @XMM[6]
1232 mov %ebx,%r10d # pass rounds
1233 movdqu 0x70($inp), @XMM[7]
1234 lea 0x80($inp), $inp
1236 call _bsaes_encrypt8
1238 movdqu @XMM[0], 0x00($out) # write output
1239 movdqu @XMM[1], 0x10($out)
1240 movdqu @XMM[4], 0x20($out)
1241 movdqu @XMM[6], 0x30($out)
1242 movdqu @XMM[3], 0x40($out)
1243 movdqu @XMM[7], 0x50($out)
1244 movdqu @XMM[2], 0x60($out)
1245 movdqu @XMM[5], 0x70($out)
1246 lea 0x80($out), $out
1253 movdqu 0x00($inp), @XMM[0] # load input
1254 mov %rsp, %rax # pass key schedule
1255 mov %ebx,%r10d # pass rounds
1258 movdqu 0x10($inp), @XMM[1]
1260 movdqu 0x20($inp), @XMM[2]
1263 movdqu 0x30($inp), @XMM[3]
1265 movdqu 0x40($inp), @XMM[4]
1268 movdqu 0x50($inp), @XMM[5]
1270 movdqu 0x60($inp), @XMM[6]
1271 call _bsaes_encrypt8
1272 movdqu @XMM[0], 0x00($out) # write output
1273 movdqu @XMM[1], 0x10($out)
1274 movdqu @XMM[4], 0x20($out)
1275 movdqu @XMM[6], 0x30($out)
1276 movdqu @XMM[3], 0x40($out)
1277 movdqu @XMM[7], 0x50($out)
1278 movdqu @XMM[2], 0x60($out)
1282 call _bsaes_encrypt8
1283 movdqu @XMM[0], 0x00($out) # write output
1284 movdqu @XMM[1], 0x10($out)
1285 movdqu @XMM[4], 0x20($out)
1286 movdqu @XMM[6], 0x30($out)
1287 movdqu @XMM[3], 0x40($out)
1288 movdqu @XMM[7], 0x50($out)
1292 call _bsaes_encrypt8
1293 movdqu @XMM[0], 0x00($out) # write output
1294 movdqu @XMM[1], 0x10($out)
1295 movdqu @XMM[4], 0x20($out)
1296 movdqu @XMM[6], 0x30($out)
1297 movdqu @XMM[3], 0x40($out)
1301 call _bsaes_encrypt8
1302 movdqu @XMM[0], 0x00($out) # write output
1303 movdqu @XMM[1], 0x10($out)
1304 movdqu @XMM[4], 0x20($out)
1305 movdqu @XMM[6], 0x30($out)
1309 call _bsaes_encrypt8
1310 movdqu @XMM[0], 0x00($out) # write output
1311 movdqu @XMM[1], 0x10($out)
1312 movdqu @XMM[4], 0x20($out)
1316 call _bsaes_encrypt8
1317 movdqu @XMM[0], 0x00($out) # write output
1318 movdqu @XMM[1], 0x10($out)
1322 call _bsaes_encrypt8
1323 movdqu @XMM[0], 0x00($out) # write output
1330 call asm_AES_encrypt
1339 .Lecb_enc_bzero: # wipe key schedule [if any]
1340 movdqa %xmm0, 0x00(%rax)
1341 movdqa %xmm0, 0x10(%rax)
1342 lea 0x20(%rax), %rax
1349 $code.=<<___ if ($win64);
1350 movaps 0x40(%rbp), %xmm6
1351 movaps 0x50(%rbp), %xmm7
1352 movaps 0x60(%rbp), %xmm8
1353 movaps 0x70(%rbp), %xmm9
1354 movaps 0x80(%rbp), %xmm10
1355 movaps 0x90(%rbp), %xmm11
1356 movaps 0xa0(%rbp), %xmm12
1357 movaps 0xb0(%rbp), %xmm13
1358 movaps 0xc0(%rbp), %xmm14
1359 movaps 0xd0(%rbp), %xmm15
1360 lea 0xa0(%rax), %rax
1376 lea (%rax), %rsp # restore %rsp
1377 .cfi_def_cfa_register %rsp
1381 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1383 .globl bsaes_ecb_decrypt_blocks
1384 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1386 bsaes_ecb_decrypt_blocks:
1402 lea -0x48(%rsp),%rsp
1403 .cfi_adjust_cfa_offset 0x48
1405 $code.=<<___ if ($win64);
1406 lea -0xa0(%rsp), %rsp
1407 movaps %xmm6, 0x40(%rsp)
1408 movaps %xmm7, 0x50(%rsp)
1409 movaps %xmm8, 0x60(%rsp)
1410 movaps %xmm9, 0x70(%rsp)
1411 movaps %xmm10, 0x80(%rsp)
1412 movaps %xmm11, 0x90(%rsp)
1413 movaps %xmm12, 0xa0(%rsp)
1414 movaps %xmm13, 0xb0(%rsp)
1415 movaps %xmm14, 0xc0(%rsp)
1416 movaps %xmm15, 0xd0(%rsp)
1420 mov %rsp,%rbp # backup %rsp
1421 .cfi_def_cfa_register %rbp
1422 mov 240($arg4),%eax # rounds
1423 mov $arg1,$inp # backup arguments
1430 mov %eax,%ebx # backup rounds
1431 shl \$7,%rax # 128 bytes per inner round key
1432 sub \$`128-32`,%rax # size of bit-sliced key schedule
1434 mov %rsp,%rax # pass key schedule
1435 mov $key,%rcx # pass key
1436 mov %ebx,%r10d # pass rounds
1437 call _bsaes_key_convert
1438 pxor (%rsp),%xmm7 # fix up 0 round key
1439 movdqa %xmm6,(%rax) # save last round key
1444 movdqu 0x00($inp), @XMM[0] # load input
1445 movdqu 0x10($inp), @XMM[1]
1446 movdqu 0x20($inp), @XMM[2]
1447 movdqu 0x30($inp), @XMM[3]
1448 movdqu 0x40($inp), @XMM[4]
1449 movdqu 0x50($inp), @XMM[5]
1450 mov %rsp, %rax # pass key schedule
1451 movdqu 0x60($inp), @XMM[6]
1452 mov %ebx,%r10d # pass rounds
1453 movdqu 0x70($inp), @XMM[7]
1454 lea 0x80($inp), $inp
1456 call _bsaes_decrypt8
1458 movdqu @XMM[0], 0x00($out) # write output
1459 movdqu @XMM[1], 0x10($out)
1460 movdqu @XMM[6], 0x20($out)
1461 movdqu @XMM[4], 0x30($out)
1462 movdqu @XMM[2], 0x40($out)
1463 movdqu @XMM[7], 0x50($out)
1464 movdqu @XMM[3], 0x60($out)
1465 movdqu @XMM[5], 0x70($out)
1466 lea 0x80($out), $out
1473 movdqu 0x00($inp), @XMM[0] # load input
1474 mov %rsp, %rax # pass key schedule
1475 mov %ebx,%r10d # pass rounds
1478 movdqu 0x10($inp), @XMM[1]
1480 movdqu 0x20($inp), @XMM[2]
1483 movdqu 0x30($inp), @XMM[3]
1485 movdqu 0x40($inp), @XMM[4]
1488 movdqu 0x50($inp), @XMM[5]
1490 movdqu 0x60($inp), @XMM[6]
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1496 movdqu @XMM[2], 0x40($out)
1497 movdqu @XMM[7], 0x50($out)
1498 movdqu @XMM[3], 0x60($out)
1502 call _bsaes_decrypt8
1503 movdqu @XMM[0], 0x00($out) # write output
1504 movdqu @XMM[1], 0x10($out)
1505 movdqu @XMM[6], 0x20($out)
1506 movdqu @XMM[4], 0x30($out)
1507 movdqu @XMM[2], 0x40($out)
1508 movdqu @XMM[7], 0x50($out)
1512 call _bsaes_decrypt8
1513 movdqu @XMM[0], 0x00($out) # write output
1514 movdqu @XMM[1], 0x10($out)
1515 movdqu @XMM[6], 0x20($out)
1516 movdqu @XMM[4], 0x30($out)
1517 movdqu @XMM[2], 0x40($out)
1521 call _bsaes_decrypt8
1522 movdqu @XMM[0], 0x00($out) # write output
1523 movdqu @XMM[1], 0x10($out)
1524 movdqu @XMM[6], 0x20($out)
1525 movdqu @XMM[4], 0x30($out)
1529 call _bsaes_decrypt8
1530 movdqu @XMM[0], 0x00($out) # write output
1531 movdqu @XMM[1], 0x10($out)
1532 movdqu @XMM[6], 0x20($out)
1536 call _bsaes_decrypt8
1537 movdqu @XMM[0], 0x00($out) # write output
1538 movdqu @XMM[1], 0x10($out)
1542 call _bsaes_decrypt8
1543 movdqu @XMM[0], 0x00($out) # write output
1550 call asm_AES_decrypt
1559 .Lecb_dec_bzero: # wipe key schedule [if any]
1560 movdqa %xmm0, 0x00(%rax)
1561 movdqa %xmm0, 0x10(%rax)
1562 lea 0x20(%rax), %rax
1569 $code.=<<___ if ($win64);
1570 movaps 0x40(%rbp), %xmm6
1571 movaps 0x50(%rbp), %xmm7
1572 movaps 0x60(%rbp), %xmm8
1573 movaps 0x70(%rbp), %xmm9
1574 movaps 0x80(%rbp), %xmm10
1575 movaps 0x90(%rbp), %xmm11
1576 movaps 0xa0(%rbp), %xmm12
1577 movaps 0xb0(%rbp), %xmm13
1578 movaps 0xc0(%rbp), %xmm14
1579 movaps 0xd0(%rbp), %xmm15
1580 lea 0xa0(%rax), %rax
1596 lea (%rax), %rsp # restore %rsp
1597 .cfi_def_cfa_register %rsp
1601 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1605 .extern asm_AES_cbc_encrypt
1606 .globl bsaes_cbc_encrypt
1607 .type bsaes_cbc_encrypt,\@abi-omnipotent
1612 $code.=<<___ if ($win64);
1613 mov 48(%rsp),$arg6 # pull direction flag
1617 jne asm_AES_cbc_encrypt
1619 jb asm_AES_cbc_encrypt
1635 lea -0x48(%rsp), %rsp
1636 .cfi_adjust_cfa_offset 0x48
1638 $code.=<<___ if ($win64);
1639 mov 0xa0(%rsp),$arg5 # pull ivp
1640 lea -0xa0(%rsp), %rsp
1641 movaps %xmm6, 0x40(%rsp)
1642 movaps %xmm7, 0x50(%rsp)
1643 movaps %xmm8, 0x60(%rsp)
1644 movaps %xmm9, 0x70(%rsp)
1645 movaps %xmm10, 0x80(%rsp)
1646 movaps %xmm11, 0x90(%rsp)
1647 movaps %xmm12, 0xa0(%rsp)
1648 movaps %xmm13, 0xb0(%rsp)
1649 movaps %xmm14, 0xc0(%rsp)
1650 movaps %xmm15, 0xd0(%rsp)
1654 mov %rsp, %rbp # backup %rsp
1655 .cfi_def_cfa_register %rbp
1656 mov 240($arg4), %eax # rounds
1657 mov $arg1, $inp # backup arguments
1662 shr \$4, $len # bytes to blocks
1664 mov %eax, %edx # rounds
1665 shl \$7, %rax # 128 bytes per inner round key
1666 sub \$`128-32`, %rax # size of bit-sliced key schedule
1669 mov %rsp, %rax # pass key schedule
1670 mov $key, %rcx # pass key
1671 mov %edx, %r10d # pass rounds
1672 call _bsaes_key_convert
1673 pxor (%rsp),%xmm7 # fix up 0 round key
1674 movdqa %xmm6,(%rax) # save last round key
1677 movdqu (%rbx), @XMM[15] # load IV
1680 movdqu 0x00($inp), @XMM[0] # load input
1681 movdqu 0x10($inp), @XMM[1]
1682 movdqu 0x20($inp), @XMM[2]
1683 movdqu 0x30($inp), @XMM[3]
1684 movdqu 0x40($inp), @XMM[4]
1685 movdqu 0x50($inp), @XMM[5]
1686 mov %rsp, %rax # pass key schedule
1687 movdqu 0x60($inp), @XMM[6]
1688 mov %edx,%r10d # pass rounds
1689 movdqu 0x70($inp), @XMM[7]
1690 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1692 call _bsaes_decrypt8
1694 pxor 0x20(%rbp), @XMM[0] # ^= IV
1695 movdqu 0x00($inp), @XMM[8] # re-load input
1696 movdqu 0x10($inp), @XMM[9]
1697 pxor @XMM[8], @XMM[1]
1698 movdqu 0x20($inp), @XMM[10]
1699 pxor @XMM[9], @XMM[6]
1700 movdqu 0x30($inp), @XMM[11]
1701 pxor @XMM[10], @XMM[4]
1702 movdqu 0x40($inp), @XMM[12]
1703 pxor @XMM[11], @XMM[2]
1704 movdqu 0x50($inp), @XMM[13]
1705 pxor @XMM[12], @XMM[7]
1706 movdqu 0x60($inp), @XMM[14]
1707 pxor @XMM[13], @XMM[3]
1708 movdqu 0x70($inp), @XMM[15] # IV
1709 pxor @XMM[14], @XMM[5]
1710 movdqu @XMM[0], 0x00($out) # write output
1711 lea 0x80($inp), $inp
1712 movdqu @XMM[1], 0x10($out)
1713 movdqu @XMM[6], 0x20($out)
1714 movdqu @XMM[4], 0x30($out)
1715 movdqu @XMM[2], 0x40($out)
1716 movdqu @XMM[7], 0x50($out)
1717 movdqu @XMM[3], 0x60($out)
1718 movdqu @XMM[5], 0x70($out)
1719 lea 0x80($out), $out
1726 movdqu 0x00($inp), @XMM[0] # load input
1727 mov %rsp, %rax # pass key schedule
1728 mov %edx, %r10d # pass rounds
1731 movdqu 0x10($inp), @XMM[1]
1733 movdqu 0x20($inp), @XMM[2]
1736 movdqu 0x30($inp), @XMM[3]
1738 movdqu 0x40($inp), @XMM[4]
1741 movdqu 0x50($inp), @XMM[5]
1743 movdqu 0x60($inp), @XMM[6]
1744 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1745 call _bsaes_decrypt8
1746 pxor 0x20(%rbp), @XMM[0] # ^= IV
1747 movdqu 0x00($inp), @XMM[8] # re-load input
1748 movdqu 0x10($inp), @XMM[9]
1749 pxor @XMM[8], @XMM[1]
1750 movdqu 0x20($inp), @XMM[10]
1751 pxor @XMM[9], @XMM[6]
1752 movdqu 0x30($inp), @XMM[11]
1753 pxor @XMM[10], @XMM[4]
1754 movdqu 0x40($inp), @XMM[12]
1755 pxor @XMM[11], @XMM[2]
1756 movdqu 0x50($inp), @XMM[13]
1757 pxor @XMM[12], @XMM[7]
1758 movdqu 0x60($inp), @XMM[15] # IV
1759 pxor @XMM[13], @XMM[3]
1760 movdqu @XMM[0], 0x00($out) # write output
1761 movdqu @XMM[1], 0x10($out)
1762 movdqu @XMM[6], 0x20($out)
1763 movdqu @XMM[4], 0x30($out)
1764 movdqu @XMM[2], 0x40($out)
1765 movdqu @XMM[7], 0x50($out)
1766 movdqu @XMM[3], 0x60($out)
1770 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1771 call _bsaes_decrypt8
1772 pxor 0x20(%rbp), @XMM[0] # ^= IV
1773 movdqu 0x00($inp), @XMM[8] # re-load input
1774 movdqu 0x10($inp), @XMM[9]
# NOTE(review): tail of bsaes_cbc_encrypt (CBC-decrypt path). The routine's
# prologue lies before this excerpt and the listing elides original lines
# (see gaps in the embedded numbering) -- do not assume adjacency.
# Each arm below handles the last N blocks: _bsaes_decrypt8 output is XORed
# with the preceding ciphertext blocks (CBC chaining); the last ciphertext
# block is kept in @XMM[15] as the IV for the next call.
# --- arm: 6 blocks remaining (reads input up to 0x50) ---
1775	pxor	@XMM[8], @XMM[1]
1776	movdqu	0x20($inp), @XMM[10]
1777	pxor	@XMM[9], @XMM[6]
1778	movdqu	0x30($inp), @XMM[11]
1779	pxor	@XMM[10], @XMM[4]
1780	movdqu	0x40($inp), @XMM[12]
1781	pxor	@XMM[11], @XMM[2]
1782	movdqu	0x50($inp), @XMM[15]	# IV
1783	pxor	@XMM[12], @XMM[7]
1784	movdqu	@XMM[0], 0x00($out)	# write output
1785	movdqu	@XMM[1], 0x10($out)
1786	movdqu	@XMM[6], 0x20($out)
1787	movdqu	@XMM[4], 0x30($out)
1788	movdqu	@XMM[2], 0x40($out)
1789	movdqu	@XMM[7], 0x50($out)
# --- arm: 5 blocks remaining ---
1793	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1794	call	_bsaes_decrypt8
1795	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1796	movdqu	0x00($inp), @XMM[8]	# re-load input
1797	movdqu	0x10($inp), @XMM[9]
1798	pxor	@XMM[8], @XMM[1]
1799	movdqu	0x20($inp), @XMM[10]
1800	pxor	@XMM[9], @XMM[6]
1801	movdqu	0x30($inp), @XMM[11]
1802	pxor	@XMM[10], @XMM[4]
1803	movdqu	0x40($inp), @XMM[15]	# IV
1804	pxor	@XMM[11], @XMM[2]
1805	movdqu	@XMM[0], 0x00($out)	# write output
1806	movdqu	@XMM[1], 0x10($out)
1807	movdqu	@XMM[6], 0x20($out)
1808	movdqu	@XMM[4], 0x30($out)
1809	movdqu	@XMM[2], 0x40($out)
# --- arm: 4 blocks remaining ---
1813	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1814	call	_bsaes_decrypt8
1815	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1816	movdqu	0x00($inp), @XMM[8]	# re-load input
1817	movdqu	0x10($inp), @XMM[9]
1818	pxor	@XMM[8], @XMM[1]
1819	movdqu	0x20($inp), @XMM[10]
1820	pxor	@XMM[9], @XMM[6]
1821	movdqu	0x30($inp), @XMM[15]	# IV
1822	pxor	@XMM[10], @XMM[4]
1823	movdqu	@XMM[0], 0x00($out)	# write output
1824	movdqu	@XMM[1], 0x10($out)
1825	movdqu	@XMM[6], 0x20($out)
1826	movdqu	@XMM[4], 0x30($out)
# --- arm: 3 blocks remaining ---
1830	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1831	call	_bsaes_decrypt8
1832	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1833	movdqu	0x00($inp), @XMM[8]	# re-load input
1834	movdqu	0x10($inp), @XMM[9]
1835	pxor	@XMM[8], @XMM[1]
1836	movdqu	0x20($inp), @XMM[15]	# IV
1837	pxor	@XMM[9], @XMM[6]
1838	movdqu	@XMM[0], 0x00($out)	# write output
1839	movdqu	@XMM[1], 0x10($out)
1840	movdqu	@XMM[6], 0x20($out)
# --- arm: 2 blocks remaining ---
1844	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1845	call	_bsaes_decrypt8
1846	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1847	movdqu	0x00($inp), @XMM[8]	# re-load input
1848	movdqu	0x10($inp), @XMM[15]	# IV
1849	pxor	@XMM[8], @XMM[1]
1850	movdqu	@XMM[0], 0x00($out)	# write output
1851	movdqu	@XMM[1], 0x10($out)
# --- arm: single block, via the non-bitsliced AES (asm_AES_decrypt) ---
1856	lea	0x20(%rbp), $arg2	# buffer output
1858	call	asm_AES_decrypt		# doesn't touch %xmm
1859	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1860	movdqu	@XMM[15], ($out)	# write output
1861	movdqa	@XMM[0], @XMM[15]	# IV
# store final IV back to caller's ivec buffer (%rbx presumably holds ivp --
# the assignment is in the elided prologue; confirm against full source)
1864	movdqu	@XMM[15], (%rbx)	# return IV
# wipe the on-stack bit-sliced key schedule, 32 bytes per iteration
1867	.Lcbc_dec_bzero:		# wipe key schedule [if any]
1868	movdqa	%xmm0, 0x00(%rax)
1869	movdqa	%xmm0, 0x10(%rax)
1870	lea	0x20(%rax), %rax
# Win64 ABI: restore callee-saved xmm6-15 saved in the prologue
1877	$code.=<<___ if ($win64);
1878	movaps	0x40(%rbp), %xmm6
1879	movaps	0x50(%rbp), %xmm7
1880	movaps	0x60(%rbp), %xmm8
1881	movaps	0x70(%rbp), %xmm9
1882	movaps	0x80(%rbp), %xmm10
1883	movaps	0x90(%rbp), %xmm11
1884	movaps	0xa0(%rbp), %xmm12
1885	movaps	0xb0(%rbp), %xmm13
1886	movaps	0xc0(%rbp), %xmm14
1887	movaps	0xd0(%rbp), %xmm15
1888	lea	0xa0(%rax), %rax
1904	lea	(%rax), %rsp		# restore %rsp
1905	.cfi_def_cfa_register	%rsp
1909	.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# bsaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec): CTR mode with a
# 32-bit big-endian counter in the last dword of ivec, 8 blocks per
# _bsaes_encrypt8 pass.  Listing elides some original lines (numbering gaps).
1911	.globl	bsaes_ctr32_encrypt_blocks
1912	.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1914	bsaes_ctr32_encrypt_blocks:
1930	lea	-0x48(%rsp), %rsp
1931	.cfi_adjust_cfa_offset	0x48
# Win64: 5th argument comes from the stack; save callee-saved xmm6-15
1933	$code.=<<___ if ($win64);
1934	mov	0xa0(%rsp),$arg5	# pull ivp
1935	lea	-0xa0(%rsp), %rsp
1936	movaps	%xmm6, 0x40(%rsp)
1937	movaps	%xmm7, 0x50(%rsp)
1938	movaps	%xmm8, 0x60(%rsp)
1939	movaps	%xmm9, 0x70(%rsp)
1940	movaps	%xmm10, 0x80(%rsp)
1941	movaps	%xmm11, 0x90(%rsp)
1942	movaps	%xmm12, 0xa0(%rsp)
1943	movaps	%xmm13, 0xb0(%rsp)
1944	movaps	%xmm14, 0xc0(%rsp)
1945	movaps	%xmm15, 0xd0(%rsp)
# %rbp is the frame anchor from here on: 0x20(%rbp) holds the counter copy
1949	mov	%rsp, %rbp		# backup %rsp
1950	.cfi_def_cfa_register	%rbp
1951	movdqu	($arg5), %xmm0		# load counter
1952	mov	240($arg4), %eax	# rounds
1953	mov	$arg1, $inp		# backup arguments
1957	movdqa	%xmm0, 0x20(%rbp)	# copy counter
# carve out stack room for the bit-sliced key schedule (rounds*128-96 bytes)
1961	mov	%eax, %ebx		# rounds
1962	shl	\$7, %rax		# 128 bytes per inner round key
1963	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
# convert the conventional AES key schedule into bit-sliced form in place
1966	mov	%rsp, %rax		# pass key schedule
1967	mov	$key, %rcx		# pass key
1968	mov	%ebx, %r10d		# pass rounds
1969	call	_bsaes_key_convert
1970	pxor	%xmm6,%xmm7		# fix up last round key
1971	movdqa	%xmm7,(%rax)		# save last round key
# byte-swap the upper qword of round-0 key and counter so that 32-bit
# big-endian counter arithmetic can be done with plain paddd
1973	movdqa	(%rsp), @XMM[9]		# load round0 key
1974	lea	.LADD1(%rip), %r11
1975	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1976	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1977	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1978	pshufb	@XMM[8], @XMM[0]
1979	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
# expand one counter into 8 consecutive counter values via .LADD1..7
1983	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1984	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1985	movdqa	@XMM[0], @XMM[2]
1986	paddd	0x00(%r11), @XMM[1]	# .LADD1
1987	movdqa	@XMM[0], @XMM[3]
1988	paddd	0x10(%r11), @XMM[2]	# .LADD2
1989	movdqa	@XMM[0], @XMM[4]
1990	paddd	0x20(%r11), @XMM[3]	# .LADD3
1991	movdqa	@XMM[0], @XMM[5]
1992	paddd	0x30(%r11), @XMM[4]	# .LADD4
1993	movdqa	@XMM[0], @XMM[6]
1994	paddd	0x40(%r11), @XMM[5]	# .LADD5
1995	movdqa	@XMM[0], @XMM[7]
1996	paddd	0x50(%r11), @XMM[6]	# .LADD6
1997	paddd	0x60(%r11), @XMM[7]	# .LADD7
1999	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
2000	# to flip byte order in 32-bit counter
2001	movdqa	(%rsp), @XMM[9]		# round 0 key
2002	lea	0x10(%rsp), %rax	# pass key schedule
2003	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
2004	pxor	@XMM[9], @XMM[0]	# xor with round0 key
2005	pxor	@XMM[9], @XMM[1]
2006	pxor	@XMM[9], @XMM[2]
2007	pxor	@XMM[9], @XMM[3]
2008	pshufb	@XMM[8], @XMM[0]
2009	pshufb	@XMM[8], @XMM[1]
2010	pxor	@XMM[9], @XMM[4]
2011	pxor	@XMM[9], @XMM[5]
2012	pshufb	@XMM[8], @XMM[2]
2013	pshufb	@XMM[8], @XMM[3]
2014	pxor	@XMM[9], @XMM[6]
2015	pxor	@XMM[9], @XMM[7]
2016	pshufb	@XMM[8], @XMM[4]
2017	pshufb	@XMM[8], @XMM[5]
2018	pshufb	@XMM[8], @XMM[6]
2019	pshufb	@XMM[8], @XMM[7]
2020	lea	.LBS0(%rip), %r11	# constants table
2021	mov	%ebx,%r10d		# pass rounds
# enter _bsaes_encrypt8 past its own prologue (already applied above)
2023	call	_bsaes_encrypt8_bitslice
# carry flag set by the (elided) block-count subtraction => fewer than 8 left
2026	jc	.Lctr_enc_loop_done
# full 8-block iteration: keystream ^= plaintext, write 8 output blocks
2028	movdqu	0x00($inp), @XMM[8]	# load input
2029	movdqu	0x10($inp), @XMM[9]
2030	movdqu	0x20($inp), @XMM[10]
2031	movdqu	0x30($inp), @XMM[11]
2032	movdqu	0x40($inp), @XMM[12]
2033	movdqu	0x50($inp), @XMM[13]
2034	movdqu	0x60($inp), @XMM[14]
2035	movdqu	0x70($inp), @XMM[15]
# note the permuted register order (@XMM[0,1,4,6,3,7,2,5]) -- that is the
# output ordering of _bsaes_encrypt8
2037	pxor	@XMM[0], @XMM[8]
2038	movdqa	0x20(%rbp), @XMM[0]	# load counter
2039	pxor	@XMM[9], @XMM[1]
2040	movdqu	@XMM[8], 0x00($out)	# write output
2041	pxor	@XMM[10], @XMM[4]
2042	movdqu	@XMM[1], 0x10($out)
2043	pxor	@XMM[11], @XMM[6]
2044	movdqu	@XMM[4], 0x20($out)
2045	pxor	@XMM[12], @XMM[3]
2046	movdqu	@XMM[6], 0x30($out)
2047	pxor	@XMM[13], @XMM[7]
2048	movdqu	@XMM[3], 0x40($out)
2049	pxor	@XMM[14], @XMM[2]
2050	movdqu	@XMM[7], 0x50($out)
2051	pxor	@XMM[15], @XMM[5]
2052	movdqu	@XMM[2], 0x60($out)
2053	lea	.LADD1(%rip), %r11
2054	movdqu	@XMM[5], 0x70($out)
2055	lea	0x80($out), $out
2056	paddd	0x70(%r11), @XMM[0]	# .LADD8  (advance counter by 8)
# tail: <8 blocks left; each step below emits one more block then (elided)
# jumps out when the remaining count is exhausted
2061	.Lctr_enc_loop_done:
2063	movdqu	0x00($inp), @XMM[8]	# load input
2064	pxor	@XMM[8], @XMM[0]
2065	movdqu	@XMM[0], 0x00($out)	# write output
2068	movdqu	0x10($inp), @XMM[9]
2069	pxor	@XMM[9], @XMM[1]
2070	movdqu	@XMM[1], 0x10($out)
2072	movdqu	0x20($inp), @XMM[10]
2073	pxor	@XMM[10], @XMM[4]
2074	movdqu	@XMM[4], 0x20($out)
2077	movdqu	0x30($inp), @XMM[11]
2078	pxor	@XMM[11], @XMM[6]
2079	movdqu	@XMM[6], 0x30($out)
2081	movdqu	0x40($inp), @XMM[12]
2082	pxor	@XMM[12], @XMM[3]
2083	movdqu	@XMM[3], 0x40($out)
2086	movdqu	0x50($inp), @XMM[13]
2087	pxor	@XMM[13], @XMM[7]
2088	movdqu	@XMM[7], 0x50($out)
2090	movdqu	0x60($inp), @XMM[14]
2091	pxor	@XMM[14], @XMM[2]
2092	movdqu	@XMM[2], 0x60($out)
# Short path: fewer than 8 blocks total -- encrypt one counter block at a
# time with the non-bitsliced AES.  The counter copy lives at 0x20(%rbp)
# (stored in the prologue); its big-endian low 32 bits are at 0x2c(%rbp).
2097	lea	0x20(%rbp), $arg1
2098	lea	0x30(%rbp), $arg2
2100	call	asm_AES_encrypt		# keystream block -> 0x30(%rbp)
2101	movdqu	($inp), @XMM[1]
2103	mov	0x2c(%rbp), %eax	# load 32-bit counter
2105	pxor	0x30(%rbp), @XMM[1]	# plaintext ^= keystream
2106	inc	%eax			# increment
2107	movdqu	@XMM[1], ($out)
# BUGFIX: counter must be written back to the %rbp-anchored counter slot it
# was loaded from.  The original stored to 0x2c(%rsp); %rsp was lowered past
# the bit-sliced key schedule after the prologue, so that store corrupted
# the key schedule and left the counter at 0x2c(%rbp) stale for the next
# iteration.  (Upstream OpenSSL bsaes-x86_64.pl uses 0x2c(%rbp) here.)
2110	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
# wipe the on-stack key schedule (keystream/key material must not leak),
# then restore Win64 xmm registers and the stack pointer
2117	.Lctr_enc_bzero:		# wipe key schedule [if any]
2118	movdqa	%xmm0, 0x00(%rax)
2119	movdqa	%xmm0, 0x10(%rax)
2120	lea	0x20(%rax), %rax
2127	$code.=<<___ if ($win64);
2128	movaps	0x40(%rbp), %xmm6
2129	movaps	0x50(%rbp), %xmm7
2130	movaps	0x60(%rbp), %xmm8
2131	movaps	0x70(%rbp), %xmm9
2132	movaps	0x80(%rbp), %xmm10
2133	movaps	0x90(%rbp), %xmm11
2134	movaps	0xa0(%rbp), %xmm12
2135	movaps	0xb0(%rbp), %xmm13
2136	movaps	0xc0(%rbp), %xmm14
2137	movaps	0xd0(%rbp), %xmm15
2138	lea	0xa0(%rax), %rax
2154	lea	(%rax), %rsp		# restore %rsp
2155	.cfi_def_cfa_register	%rsp
2159	.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2161	######################################################################
2162	# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2163	#	const AES_KEY *key1, const AES_KEY *key2,
2164	#	const unsigned char iv[16]);
# scratch xmm registers for the GF(2^128) tweak-doubling arithmetic
2166	my ($twmask,$twres,$twtmp)=@XMM[13..15];
2170	.globl	bsaes_xts_encrypt
2171	.type	bsaes_xts_encrypt,\@abi-omnipotent
2189	lea	-0x48(%rsp), %rsp
2190	.cfi_adjust_cfa_offset	0x48
# Win64: args 5/6 (key2, ivp) come from the stack; save xmm6-15
2192	$code.=<<___ if ($win64);
2193	mov	0xa0(%rsp),$arg5	# pull key2
2194	mov	0xa8(%rsp),$arg6	# pull ivp
2195	lea	-0xa0(%rsp), %rsp
2196	movaps	%xmm6, 0x40(%rsp)
2197	movaps	%xmm7, 0x50(%rsp)
2198	movaps	%xmm8, 0x60(%rsp)
2199	movaps	%xmm9, 0x70(%rsp)
2200	movaps	%xmm10, 0x80(%rsp)
2201	movaps	%xmm11, 0x90(%rsp)
2202	movaps	%xmm12, 0xa0(%rsp)
2203	movaps	%xmm13, 0xb0(%rsp)
2204	movaps	%xmm14, 0xc0(%rsp)
2205	movaps	%xmm15, 0xd0(%rsp)
2209	mov	%rsp, %rbp		# backup %rsp
2210	.cfi_def_cfa_register	%rbp
2211	mov	$arg1, $inp		# backup arguments
# initial tweak = AES-enc(key2, iv), stored at 0x20(%rbp)
2217	lea	0x20(%rbp), $arg2
2219	call	asm_AES_encrypt		# generate initial tweak
2221	mov	240($key), %eax		# rounds
2222	mov	$len, %rbx		# backup $len
2224	mov	%eax, %edx		# rounds
2225	shl	\$7, %rax		# 128 bytes per inner round key
2226	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
# convert key1 to bit-sliced form on the stack
2229	mov	%rsp, %rax		# pass key schedule
2230	mov	$key, %rcx		# pass key
2231	mov	%edx, %r10d		# pass rounds
2232	call	_bsaes_key_convert
2233	pxor	%xmm6, %xmm7		# fix up last round key
2234	movdqa	%xmm7, (%rax)		# save last round key
2237	sub	\$0x80, %rsp		# place for tweak[8]
2238	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
# .Lxts_magic holds the GF(2^128) reduction constant; pcmpgtd broadcasts
# the sign bits so the carry out of the left shift can be folded back in
2241	movdqa	.Lxts_magic(%rip), $twmask
2242	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
# main 8-block loop: Perl-generate tweak[0..6], each time doubling @XMM[7]
# in GF(2^128) and loading/XORing the matching input block
2251	for ($i=0;$i<7;$i++) {
2253	pshufd	\$0x13, $twtmp, $twres
2255	movdqa	@XMM[7], @XMM[$i]
2256	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2257	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2258	pand	$twmask, $twres		# isolate carry and residue
2259	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2260	pxor	$twres, @XMM[7]
2262	$code.=<<___ if ($i>=1);
2263	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2265	$code.=<<___ if ($i>=2);
2266	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2270	movdqu	0x60($inp), @XMM[8+6]
2271	pxor	@XMM[8+5], @XMM[5]
2272	movdqu	0x70($inp), @XMM[8+7]
2273	lea	0x80($inp), $inp
2274	movdqa	@XMM[7], 0x70(%rsp)
2275	pxor	@XMM[8+6], @XMM[6]
2276	lea	0x80(%rsp), %rax	# pass key schedule
2277	pxor	@XMM[8+7], @XMM[7]
2278	mov	%edx, %r10d		# pass rounds
2280	call	_bsaes_encrypt8
# post-whiten with the saved tweaks; note _bsaes_encrypt8's permuted
# output order @XMM[0,1,4,6,3,7,2,5]
2282	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2283	pxor	0x10(%rsp), @XMM[1]
2284	movdqu	@XMM[0], 0x00($out)	# write output
2285	pxor	0x20(%rsp), @XMM[4]
2286	movdqu	@XMM[1], 0x10($out)
2287	pxor	0x30(%rsp), @XMM[6]
2288	movdqu	@XMM[4], 0x20($out)
2289	pxor	0x40(%rsp), @XMM[3]
2290	movdqu	@XMM[6], 0x30($out)
2291	pxor	0x50(%rsp), @XMM[7]
2292	movdqu	@XMM[3], 0x40($out)
2293	pxor	0x60(%rsp), @XMM[2]
2294	movdqu	@XMM[7], 0x50($out)
2295	pxor	0x70(%rsp), @XMM[5]
2296	movdqu	@XMM[2], 0x60($out)
2297	movdqu	@XMM[5], 0x70($out)
2298	lea	0x80($out), $out
# double tweak[7] once more to seed the next iteration
2300	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2302	movdqa	.Lxts_magic(%rip), $twmask
2303	pcmpgtd	@XMM[7], $twtmp
2304	pshufd	\$0x13, $twtmp, $twres
2306	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2307	pand	$twmask, $twres		# isolate carry and residue
2308	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2309	pxor	$twres, @XMM[7]
# short path (<8 blocks): same tweak generation, but with a per-step
# length compare (elided branches dispatch to the .Lxts_enc_N arms below)
2318	for ($i=0;$i<7;$i++) {
2320	pshufd	\$0x13, $twtmp, $twres
2322	movdqa	@XMM[7], @XMM[$i]
2323	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2324	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2325	pand	$twmask, $twres		# isolate carry and residue
2326	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2327	pxor	$twres, @XMM[7]
2329	$code.=<<___ if ($i>=1);
2330	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2331	cmp	\$`0x10*$i`,$len
2334	$code.=<<___ if ($i>=2);
2335	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# --- arm: 7 blocks ---
2339	movdqu	0x60($inp), @XMM[8+6]
2340	pxor	@XMM[8+5], @XMM[5]
2341	movdqa	@XMM[7], 0x70(%rsp)
2342	lea	0x70($inp), $inp
2343	pxor	@XMM[8+6], @XMM[6]
2344	lea	0x80(%rsp), %rax	# pass key schedule
2345	mov	%edx, %r10d		# pass rounds
2347	call	_bsaes_encrypt8
2349	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2350	pxor	0x10(%rsp), @XMM[1]
2351	movdqu	@XMM[0], 0x00($out)	# write output
2352	pxor	0x20(%rsp), @XMM[4]
2353	movdqu	@XMM[1], 0x10($out)
2354	pxor	0x30(%rsp), @XMM[6]
2355	movdqu	@XMM[4], 0x20($out)
2356	pxor	0x40(%rsp), @XMM[3]
2357	movdqu	@XMM[6], 0x30($out)
2358	pxor	0x50(%rsp), @XMM[7]
2359	movdqu	@XMM[3], 0x40($out)
2360	pxor	0x60(%rsp), @XMM[2]
2361	movdqu	@XMM[7], 0x50($out)
2362	movdqu	@XMM[2], 0x60($out)
2363	lea	0x70($out), $out
2365	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 6 blocks ---
2369	pxor	@XMM[8+4], @XMM[4]
2370	lea	0x60($inp), $inp
2371	pxor	@XMM[8+5], @XMM[5]
2372	lea	0x80(%rsp), %rax	# pass key schedule
2373	mov	%edx, %r10d		# pass rounds
2375	call	_bsaes_encrypt8
2377	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2378	pxor	0x10(%rsp), @XMM[1]
2379	movdqu	@XMM[0], 0x00($out)	# write output
2380	pxor	0x20(%rsp), @XMM[4]
2381	movdqu	@XMM[1], 0x10($out)
2382	pxor	0x30(%rsp), @XMM[6]
2383	movdqu	@XMM[4], 0x20($out)
2384	pxor	0x40(%rsp), @XMM[3]
2385	movdqu	@XMM[6], 0x30($out)
2386	pxor	0x50(%rsp), @XMM[7]
2387	movdqu	@XMM[3], 0x40($out)
2388	movdqu	@XMM[7], 0x50($out)
2389	lea	0x60($out), $out
2391	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 5 blocks ---
2395	pxor	@XMM[8+3], @XMM[3]
2396	lea	0x50($inp), $inp
2397	pxor	@XMM[8+4], @XMM[4]
2398	lea	0x80(%rsp), %rax	# pass key schedule
2399	mov	%edx, %r10d		# pass rounds
2401	call	_bsaes_encrypt8
2403	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2404	pxor	0x10(%rsp), @XMM[1]
2405	movdqu	@XMM[0], 0x00($out)	# write output
2406	pxor	0x20(%rsp), @XMM[4]
2407	movdqu	@XMM[1], 0x10($out)
2408	pxor	0x30(%rsp), @XMM[6]
2409	movdqu	@XMM[4], 0x20($out)
2410	pxor	0x40(%rsp), @XMM[3]
2411	movdqu	@XMM[6], 0x30($out)
2412	movdqu	@XMM[3], 0x40($out)
2413	lea	0x50($out), $out
2415	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 4 blocks ---
2419	pxor	@XMM[8+2], @XMM[2]
2420	lea	0x40($inp), $inp
2421	pxor	@XMM[8+3], @XMM[3]
2422	lea	0x80(%rsp), %rax	# pass key schedule
2423	mov	%edx, %r10d		# pass rounds
2425	call	_bsaes_encrypt8
2427	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2428	pxor	0x10(%rsp), @XMM[1]
2429	movdqu	@XMM[0], 0x00($out)	# write output
2430	pxor	0x20(%rsp), @XMM[4]
2431	movdqu	@XMM[1], 0x10($out)
2432	pxor	0x30(%rsp), @XMM[6]
2433	movdqu	@XMM[4], 0x20($out)
2434	movdqu	@XMM[6], 0x30($out)
2435	lea	0x40($out), $out
2437	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 3 blocks ---
2441	pxor	@XMM[8+1], @XMM[1]
2442	lea	0x30($inp), $inp
2443	pxor	@XMM[8+2], @XMM[2]
2444	lea	0x80(%rsp), %rax	# pass key schedule
2445	mov	%edx, %r10d		# pass rounds
2447	call	_bsaes_encrypt8
2449	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2450	pxor	0x10(%rsp), @XMM[1]
2451	movdqu	@XMM[0], 0x00($out)	# write output
2452	pxor	0x20(%rsp), @XMM[4]
2453	movdqu	@XMM[1], 0x10($out)
2454	movdqu	@XMM[4], 0x20($out)
2455	lea	0x30($out), $out
2457	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 2 blocks ---
2461	pxor	@XMM[8+0], @XMM[0]
2462	lea	0x20($inp), $inp
2463	pxor	@XMM[8+1], @XMM[1]
2464	lea	0x80(%rsp), %rax	# pass key schedule
2465	mov	%edx, %r10d		# pass rounds
2467	call	_bsaes_encrypt8
2469	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2470	pxor	0x10(%rsp), @XMM[1]
2471	movdqu	@XMM[0], 0x00($out)	# write output
2472	movdqu	@XMM[1], 0x10($out)
2473	lea	0x20($out), $out
2475	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
# --- arm: single block, via non-bitsliced asm_AES_encrypt ---
2479	pxor	@XMM[0], @XMM[8]
2480	lea	0x10($inp), $inp
2481	movdqa	@XMM[8], 0x20(%rbp)
2482	lea	0x20(%rbp), $arg1
2483	lea	0x20(%rbp), $arg2
2485	call	asm_AES_encrypt		# doesn't touch %xmm
2486	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2487	#pxor	@XMM[8], @XMM[0]
2488	#lea	0x80(%rsp), %rax	# pass key schedule
2489	#mov	%edx, %r10d		# pass rounds
2490	#call	_bsaes_encrypt8
2491	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2492	movdqu	@XMM[0], 0x00($out)	# write output
2493	lea	0x10($out), $out
2495	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
# ciphertext stealing for a non-multiple-of-16 tail: swap the trailing
# partial block with the tail of the last full ciphertext block, then
# re-encrypt that block with the last tweak (@XMM[7]).
# NOTE(review): this steal loop addresses -16(%rdx) directly while sibling
# code uses the symbolic $out -- confirm $out maps to %rdx in this build or
# this is a transcription slip.
2504	movzb	-16(%rdx), %ecx
2512	movdqu	-16($out), @XMM[0]
2513	lea	0x20(%rbp), $arg1
2514	pxor	@XMM[7], @XMM[0]
2515	lea	0x20(%rbp), $arg2
2516	movdqa	@XMM[0], 0x20(%rbp)
2518	call	asm_AES_encrypt		# doesn't touch %xmm
2519	pxor	0x20(%rbp), @XMM[7]
2520	movdqu	@XMM[7], -16($out)
# wipe key schedule, restore Win64 xmm state, restore %rsp
2525	.Lxts_enc_bzero:		# wipe key schedule [if any]
2526	movdqa	%xmm0, 0x00(%rax)
2527	movdqa	%xmm0, 0x10(%rax)
2528	lea	0x20(%rax), %rax
2535	$code.=<<___ if ($win64);
2536	movaps	0x40(%rbp), %xmm6
2537	movaps	0x50(%rbp), %xmm7
2538	movaps	0x60(%rbp), %xmm8
2539	movaps	0x70(%rbp), %xmm9
2540	movaps	0x80(%rbp), %xmm10
2541	movaps	0x90(%rbp), %xmm11
2542	movaps	0xa0(%rbp), %xmm12
2543	movaps	0xb0(%rbp), %xmm13
2544	movaps	0xc0(%rbp), %xmm14
2545	movaps	0xd0(%rbp), %xmm15
2546	lea	0xa0(%rax), %rax
2562	lea	(%rax), %rsp		# restore %rsp
2563	.cfi_def_cfa_register	%rsp
2567	.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
# bsaes_xts_decrypt: mirror of bsaes_xts_encrypt.  Differences: key1 is
# fixed up for decryption (round-0 key fixup instead of last-round), the
# bulk work uses _bsaes_decrypt8, and ciphertext stealing must decrypt the
# second-to-last block with the *next* tweak (@XMM[6]) before the steal.
2569	.globl	bsaes_xts_decrypt
2570	.type	bsaes_xts_decrypt,\@abi-omnipotent
2588	lea	-0x48(%rsp), %rsp
2589	.cfi_adjust_cfa_offset	0x48
2591	$code.=<<___ if ($win64);
2592	mov	0xa0(%rsp),$arg5	# pull key2
2593	mov	0xa8(%rsp),$arg6	# pull ivp
2594	lea	-0xa0(%rsp), %rsp
2595	movaps	%xmm6, 0x40(%rsp)
2596	movaps	%xmm7, 0x50(%rsp)
2597	movaps	%xmm8, 0x60(%rsp)
2598	movaps	%xmm9, 0x70(%rsp)
2599	movaps	%xmm10, 0x80(%rsp)
2600	movaps	%xmm11, 0x90(%rsp)
2601	movaps	%xmm12, 0xa0(%rsp)
2602	movaps	%xmm13, 0xb0(%rsp)
2603	movaps	%xmm14, 0xc0(%rsp)
2604	movaps	%xmm15, 0xd0(%rsp)
# NOTE(review): unlike the encrypt twin (cf. 2209/2210) no
# .cfi_def_cfa_register %rbp follows -- confirm this is intentional or an
# elision in this listing.
2608	mov	%rsp, %rbp		# backup %rsp
2609	mov	$arg1, $inp		# backup arguments
# initial tweak is always generated with AES *encryption* under key2
2615	lea	0x20(%rbp), $arg2
2617	call	asm_AES_encrypt		# generate initial tweak
2619	mov	240($key), %eax		# rounds
2620	mov	$len, %rbx		# backup $len
2622	mov	%eax, %edx		# rounds
2623	shl	\$7, %rax		# 128 bytes per inner round key
2624	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2627	mov	%rsp, %rax		# pass key schedule
2628	mov	$key, %rcx		# pass key
2629	mov	%edx, %r10d		# pass rounds
2630	call	_bsaes_key_convert
# decryption key fixup: fold old round-0 key into the last converted key
# and swap first/last slots (inverse cipher consumes keys in reverse)
2631	pxor	(%rsp), %xmm7		# fix up round 0 key
2632	movdqa	%xmm6, (%rax)		# save last round key
2633	movdqa	%xmm7, (%rsp)
# if len is not a multiple of 16, hold one block back for steal handling
2635	xor	%eax, %eax		# if ($len%16) len-=16;
2642	sub	\$0x80, %rsp		# place for tweak[8]
2643	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2646	movdqa	.Lxts_magic(%rip), $twmask
2647	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
# main 8-block loop: generate tweak[0..7] by GF(2^128) doubling, XOR in
# the ciphertext blocks, decrypt, XOR the tweaks back out
2656	for ($i=0;$i<7;$i++) {
2658	pshufd	\$0x13, $twtmp, $twres
2660	movdqa	@XMM[7], @XMM[$i]
2661	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2662	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2663	pand	$twmask, $twres		# isolate carry and residue
2664	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2665	pxor	$twres, @XMM[7]
2667	$code.=<<___ if ($i>=1);
2668	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2670	$code.=<<___ if ($i>=2);
2671	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2675	movdqu	0x60($inp), @XMM[8+6]
2676	pxor	@XMM[8+5], @XMM[5]
2677	movdqu	0x70($inp), @XMM[8+7]
2678	lea	0x80($inp), $inp
2679	movdqa	@XMM[7], 0x70(%rsp)
2680	pxor	@XMM[8+6], @XMM[6]
2681	lea	0x80(%rsp), %rax	# pass key schedule
2682	pxor	@XMM[8+7], @XMM[7]
2683	mov	%edx, %r10d		# pass rounds
2685	call	_bsaes_decrypt8
# _bsaes_decrypt8's output register order is @XMM[0,1,6,4,2,7,3,5]
2687	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2688	pxor	0x10(%rsp), @XMM[1]
2689	movdqu	@XMM[0], 0x00($out)	# write output
2690	pxor	0x20(%rsp), @XMM[6]
2691	movdqu	@XMM[1], 0x10($out)
2692	pxor	0x30(%rsp), @XMM[4]
2693	movdqu	@XMM[6], 0x20($out)
2694	pxor	0x40(%rsp), @XMM[2]
2695	movdqu	@XMM[4], 0x30($out)
2696	pxor	0x50(%rsp), @XMM[7]
2697	movdqu	@XMM[2], 0x40($out)
2698	pxor	0x60(%rsp), @XMM[3]
2699	movdqu	@XMM[7], 0x50($out)
2700	pxor	0x70(%rsp), @XMM[5]
2701	movdqu	@XMM[3], 0x60($out)
2702	movdqu	@XMM[5], 0x70($out)
2703	lea	0x80($out), $out
2705	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2707	movdqa	.Lxts_magic(%rip), $twmask
2708	pcmpgtd	@XMM[7], $twtmp
2709	pshufd	\$0x13, $twtmp, $twres
2711	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2712	pand	$twmask, $twres		# isolate carry and residue
2713	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2714	pxor	$twres, @XMM[7]
# short path (<8 blocks), with per-step length compares (branches elided)
2723	for ($i=0;$i<7;$i++) {
2725	pshufd	\$0x13, $twtmp, $twres
2727	movdqa	@XMM[7], @XMM[$i]
2728	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2729	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2730	pand	$twmask, $twres		# isolate carry and residue
2731	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2732	pxor	$twres, @XMM[7]
2734	$code.=<<___ if ($i>=1);
2735	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2736	cmp	\$`0x10*$i`,$len
2739	$code.=<<___ if ($i>=2);
2740	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# --- arm: 7 blocks ---
2744	movdqu	0x60($inp), @XMM[8+6]
2745	pxor	@XMM[8+5], @XMM[5]
2746	movdqa	@XMM[7], 0x70(%rsp)
2747	lea	0x70($inp), $inp
2748	pxor	@XMM[8+6], @XMM[6]
2749	lea	0x80(%rsp), %rax	# pass key schedule
2750	mov	%edx, %r10d		# pass rounds
2752	call	_bsaes_decrypt8
2754	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2755	pxor	0x10(%rsp), @XMM[1]
2756	movdqu	@XMM[0], 0x00($out)	# write output
2757	pxor	0x20(%rsp), @XMM[6]
2758	movdqu	@XMM[1], 0x10($out)
2759	pxor	0x30(%rsp), @XMM[4]
2760	movdqu	@XMM[6], 0x20($out)
2761	pxor	0x40(%rsp), @XMM[2]
2762	movdqu	@XMM[4], 0x30($out)
2763	pxor	0x50(%rsp), @XMM[7]
2764	movdqu	@XMM[2], 0x40($out)
2765	pxor	0x60(%rsp), @XMM[3]
2766	movdqu	@XMM[7], 0x50($out)
2767	movdqu	@XMM[3], 0x60($out)
2768	lea	0x70($out), $out
2770	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 6 blocks ---
2774	pxor	@XMM[8+4], @XMM[4]
2775	lea	0x60($inp), $inp
2776	pxor	@XMM[8+5], @XMM[5]
2777	lea	0x80(%rsp), %rax	# pass key schedule
2778	mov	%edx, %r10d		# pass rounds
2780	call	_bsaes_decrypt8
2782	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2783	pxor	0x10(%rsp), @XMM[1]
2784	movdqu	@XMM[0], 0x00($out)	# write output
2785	pxor	0x20(%rsp), @XMM[6]
2786	movdqu	@XMM[1], 0x10($out)
2787	pxor	0x30(%rsp), @XMM[4]
2788	movdqu	@XMM[6], 0x20($out)
2789	pxor	0x40(%rsp), @XMM[2]
2790	movdqu	@XMM[4], 0x30($out)
2791	pxor	0x50(%rsp), @XMM[7]
2792	movdqu	@XMM[2], 0x40($out)
2793	movdqu	@XMM[7], 0x50($out)
2794	lea	0x60($out), $out
2796	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 5 blocks ---
2800	pxor	@XMM[8+3], @XMM[3]
2801	lea	0x50($inp), $inp
2802	pxor	@XMM[8+4], @XMM[4]
2803	lea	0x80(%rsp), %rax	# pass key schedule
2804	mov	%edx, %r10d		# pass rounds
2806	call	_bsaes_decrypt8
2808	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2809	pxor	0x10(%rsp), @XMM[1]
2810	movdqu	@XMM[0], 0x00($out)	# write output
2811	pxor	0x20(%rsp), @XMM[6]
2812	movdqu	@XMM[1], 0x10($out)
2813	pxor	0x30(%rsp), @XMM[4]
2814	movdqu	@XMM[6], 0x20($out)
2815	pxor	0x40(%rsp), @XMM[2]
2816	movdqu	@XMM[4], 0x30($out)
2817	movdqu	@XMM[2], 0x40($out)
2818	lea	0x50($out), $out
2820	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 4 blocks ---
2824	pxor	@XMM[8+2], @XMM[2]
2825	lea	0x40($inp), $inp
2826	pxor	@XMM[8+3], @XMM[3]
2827	lea	0x80(%rsp), %rax	# pass key schedule
2828	mov	%edx, %r10d		# pass rounds
2830	call	_bsaes_decrypt8
2832	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2833	pxor	0x10(%rsp), @XMM[1]
2834	movdqu	@XMM[0], 0x00($out)	# write output
2835	pxor	0x20(%rsp), @XMM[6]
2836	movdqu	@XMM[1], 0x10($out)
2837	pxor	0x30(%rsp), @XMM[4]
2838	movdqu	@XMM[6], 0x20($out)
2839	movdqu	@XMM[4], 0x30($out)
2840	lea	0x40($out), $out
2842	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 3 blocks ---
2846	pxor	@XMM[8+1], @XMM[1]
2847	lea	0x30($inp), $inp
2848	pxor	@XMM[8+2], @XMM[2]
2849	lea	0x80(%rsp), %rax	# pass key schedule
2850	mov	%edx, %r10d		# pass rounds
2852	call	_bsaes_decrypt8
2854	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2855	pxor	0x10(%rsp), @XMM[1]
2856	movdqu	@XMM[0], 0x00($out)	# write output
2857	pxor	0x20(%rsp), @XMM[6]
2858	movdqu	@XMM[1], 0x10($out)
2859	movdqu	@XMM[6], 0x20($out)
2860	lea	0x30($out), $out
2862	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
# --- arm: 2 blocks ---
2866	pxor	@XMM[8+0], @XMM[0]
2867	lea	0x20($inp), $inp
2868	pxor	@XMM[8+1], @XMM[1]
2869	lea	0x80(%rsp), %rax	# pass key schedule
2870	mov	%edx, %r10d		# pass rounds
2872	call	_bsaes_decrypt8
2874	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2875	pxor	0x10(%rsp), @XMM[1]
2876	movdqu	@XMM[0], 0x00($out)	# write output
2877	movdqu	@XMM[1], 0x10($out)
2878	lea	0x20($out), $out
2880	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
# --- arm: single block, via non-bitsliced asm_AES_decrypt ---
2884	pxor	@XMM[0], @XMM[8]
2885	lea	0x10($inp), $inp
2886	movdqa	@XMM[8], 0x20(%rbp)
2887	lea	0x20(%rbp), $arg1
2888	lea	0x20(%rbp), $arg2
2890	call	asm_AES_decrypt		# doesn't touch %xmm
2891	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2892	#pxor	@XMM[8], @XMM[0]
2893	#lea	0x80(%rsp), %rax	# pass key schedule
2894	#mov	%edx, %r10d		# pass rounds
2895	#call	_bsaes_decrypt8
2896	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2897	movdqu	@XMM[0], 0x00($out)	# write output
2898	lea	0x10($out), $out
2900	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
# ciphertext stealing (decrypt side): keep the current tweak in @XMM[6]
# and decrypt the last full block with the *doubled* tweak @XMM[7] first
2907	movdqa	.Lxts_magic(%rip), $twmask
2908	pcmpgtd	@XMM[7], $twtmp
2909	pshufd	\$0x13, $twtmp, $twres
2910	movdqa	@XMM[7], @XMM[6]
2911	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2912	pand	$twmask, $twres		# isolate carry and residue
2913	movdqu	($inp), @XMM[0]
2914	pxor	$twres, @XMM[7]
2916	lea	0x20(%rbp), $arg1
2917	pxor	@XMM[7], @XMM[0]
2918	lea	0x20(%rbp), $arg2
2919	movdqa	@XMM[0], 0x20(%rbp)
2921	call	asm_AES_decrypt		# doesn't touch %xmm
2922	pxor	0x20(%rbp), @XMM[7]
2924	movdqu	@XMM[7], ($out)
# swap the partial trailing block bytes with the just-written block's tail
2927	movzb	16($inp), %eax
# finally decrypt the stolen block with the held-back tweak @XMM[6]
2936	movdqu	($out), @XMM[0]
2937	lea	0x20(%rbp), $arg1
2938	pxor	@XMM[6], @XMM[0]
2939	lea	0x20(%rbp), $arg2
2940	movdqa	@XMM[0], 0x20(%rbp)
2942	call	asm_AES_decrypt		# doesn't touch %xmm
2943	pxor	0x20(%rbp), @XMM[6]
2944	movdqu	@XMM[6], ($out)
# wipe key schedule, restore Win64 xmm state, restore %rsp
2949	.Lxts_dec_bzero:		# wipe key schedule [if any]
2950	movdqa	%xmm0, 0x00(%rax)
2951	movdqa	%xmm0, 0x10(%rax)
2952	lea	0x20(%rax), %rax
2959	$code.=<<___ if ($win64);
2960	movaps	0x40(%rbp), %xmm6
2961	movaps	0x50(%rbp), %xmm7
2962	movaps	0x60(%rbp), %xmm8
2963	movaps	0x70(%rbp), %xmm9
2964	movaps	0x80(%rbp), %xmm10
2965	movaps	0x90(%rbp), %xmm11
2966	movaps	0xa0(%rbp), %xmm12
2967	movaps	0xb0(%rbp), %xmm13
2968	movaps	0xc0(%rbp), %xmm14
2969	movaps	0xd0(%rbp), %xmm15
2970	lea	0xa0(%rax), %rax
2986	lea	(%rax), %rsp		# restore %rsp
2987	.cfi_def_cfa_register	%rsp
2991	.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Read-only constant tables consumed via %rip-relative addressing by the
# routines above.  Several labels between entries are elided in this
# listing (numbering gaps) -- annotations below are per visible label only.
2995	.type	_bsaes_const,\@object
2998	.LM0ISR:	# InvShiftRows constants
2999	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
3001	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
3003	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
3004	.LBS0:		# bit-slice constants
3005	.quad	0x5555555555555555, 0x5555555555555555
3007	.quad	0x3333333333333333, 0x3333333333333333
3009	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3010	.LSR:		# shiftrows constants
3011	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
3013	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
3015	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
3016	.LSWPUP:	# byte-swap upper dword
3017	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
3019	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
# .LADD1..ADD8: per-lane big-endian increments for CTR counter expansion
3020	.LADD1:		# counter increment constants
3021	.quad	0x0000000000000000, 0x0000000100000000
3023	.quad	0x0000000000000000, 0x0000000200000000
3025	.quad	0x0000000000000000, 0x0000000300000000
3027	.quad	0x0000000000000000, 0x0000000400000000
3029	.quad	0x0000000000000000, 0x0000000500000000
3031	.quad	0x0000000000000000, 0x0000000600000000
3033	.quad	0x0000000000000000, 0x0000000700000000
3035	.quad	0x0000000000000000, 0x0000000800000000
# bit masks 0x01/0x02/0x04/0x08 per byte (presumably .Lmasks -- label elided)
3039	.quad	0x0101010101010101, 0x0101010101010101
3040	.quad	0x0202020202020202, 0x0202020202020202
3041	.quad	0x0404040404040404, 0x0404040404040404
3042	.quad	0x0808080808080808, 0x0808080808080808
3044	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 per byte: the AES key-schedule S-box affine constant
3046	.quad	0x6363636363636363, 0x6363636363636363
3047	.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3049	.size	_bsaes_const,.-_bsaes_const
3052	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3053	#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler shared by all entry points above.
# HandlerData[0..2] are .rva offsets (body/epilogue/tail labels); if the
# fault lies inside the body, the saved xmm6-15 and GPRs are recovered
# from the routine's frame so unwinding can proceed.
3061	.extern	__imp_RtlVirtualUnwind
3062	.type	se_handler,\@abi-omnipotent
3076	mov	120($context),%rax	# pull context->Rax
3077	mov	248($context),%rbx	# pull context->Rip
3079	mov	8($disp),%rsi		# disp->ImageBase
3080	mov	56($disp),%r11		# disp->HandlerData
# three-way classification of the faulting Rip against the label table
3082	mov	0(%r11),%r10d		# HandlerData[0]
3083	lea	(%rsi,%r10),%r10	# prologue label
3084	cmp	%r10,%rbx		# context->Rip<=prologue label
3087	mov	4(%r11),%r10d		# HandlerData[1]
3088	lea	(%rsi,%r10),%r10	# epilogue label
3089	cmp	%r10,%rbx		# context->Rip>=epilogue label
3092	mov	8(%r11),%r10d		# HandlerData[2]
3093	lea	(%rsi,%r10),%r10	# epilogue label
3094	cmp	%r10,%rbx		# context->Rip>=tail label
# in-body: copy the 10 saved xmm registers from the frame into the CONTEXT
3097	mov	160($context),%rax	# pull context->Rbp
3099	lea	0x40(%rax),%rsi		# %xmm save area
3100	lea	512($context),%rdi	# &context.Xmm6
3101	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
3102	.long	0xa548f3fc		# cld; rep movsq
3103	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
# restore the callee-saved GPRs (values loaded in elided lines above)
3112	mov	%rbx,144($context)	# restore context->Rbx
3113	mov	%rbp,160($context)	# restore context->Rbp
3114	mov	%r12,216($context)	# restore context->R12
3115	mov	%r13,224($context)	# restore context->R13
3116	mov	%r14,232($context)	# restore context->R14
3117	mov	%r15,240($context)	# restore context->R15
3120	mov	%rax,152($context)	# restore context->Rsp
# hand off to RtlVirtualUnwind with the (copied) dispatcher context
3122	mov	40($disp),%rdi		# disp->ContextRecord
3123	mov	$context,%rsi		# context
3124	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3125	.long	0xa548f3fc		# cld; rep movsq
3128	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3129	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3130	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3131	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3132	mov	40(%rsi),%r10		# disp->ContextRecord
3133	lea	56(%rsi),%r11		# &disp->HandlerData
3134	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3135	mov	%r10,32(%rsp)		# arg5
3136	mov	%r11,40(%rsp)		# arg6
3137	mov	%r12,48(%rsp)		# arg7
3138	mov	%rcx,56(%rsp)		# arg8, (NULL)
3139	call	*__imp_RtlVirtualUnwind(%rip)
3141	mov	\$1,%eax		# ExceptionContinueSearch
3153	.size	se_handler,.-se_handler
# Win64 SEH pdata/xdata tables: one prologue/epilogue .rva pair per entry
# point (ECB entries emitted only when $ecb is set), then the matching
# HandlerData (body..epilogue ranges consumed by se_handler).
3158	$code.=<<___ if ($ecb);
3159	.rva	.Lecb_enc_prologue
3160	.rva	.Lecb_enc_epilogue
3163	.rva	.Lecb_dec_prologue
3164	.rva	.Lecb_dec_epilogue
3168	.rva	.Lcbc_dec_prologue
3169	.rva	.Lcbc_dec_epilogue
3172	.rva	.Lctr_enc_prologue
3173	.rva	.Lctr_enc_epilogue
3176	.rva	.Lxts_enc_prologue
3177	.rva	.Lxts_enc_epilogue
3180	.rva	.Lxts_dec_prologue
3181	.rva	.Lxts_dec_epilogue
3187	$code.=<<___ if ($ecb);
3191	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3197	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3205	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3211	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3217	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3223	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
# perlasm post-processing: evaluate `...` arithmetic embedded in the code
3229	$code =~ s/\`([^\`]*)\`/eval($1)/gem;