3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In an attempt to address this deterioration, sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
92 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93 # suboptimal, but XTS is meant to be used with larger blocks...
99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106 die "can't locate x86_64-xlate.pl";
108 open OUT,"| \"$^X\" $xlate $flavour $output";
111 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
116 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
119 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
130 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
153 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
174 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
184 sub InvInBasisChange { # OutBasisChange in reverse
185 my @b=@_[5,1,2,6,3,7,0,4];
203 sub InvOutBasisChange { # InBasisChange in reverse
204 my @b=@_[2,5,7,3,6,1,0,4];
225 #;*************************************************************
226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227 #;*************************************************************
228 my ($x0,$x1,$y0,$y1,$t0)=@_;
241 sub Mul_GF4_N { # not used, see next subroutine
242 # multiply and scale by N
243 my ($x0,$x1,$y0,$y1,$t0)=@_;
257 # interleaved Mul_GF4_N and Mul_GF4
258 my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
322 #;********************************************************************
323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324 #;********************************************************************
328 # direct optimizations from hardware
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
385 # new smaller inversion
419 # output in s3, s2, s1, t1
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
429 # AES linear components
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
438 pxor 0x20($key),@x[2]
440 pxor 0x30($key),@x[3]
442 pxor 0x40($key),@x[4]
444 pxor 0x50($key),@x[5]
446 pxor 0x60($key),@x[6]
448 pxor 0x70($key),@x[7]
456 # modified to emit output in order suitable for feeding back to aesenc[last]
460 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
461 pshufd \$0x93, @x[1], @t[1]
462 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
463 pshufd \$0x93, @x[2], @t[2]
465 pshufd \$0x93, @x[3], @t[3]
467 pshufd \$0x93, @x[4], @t[4]
469 pshufd \$0x93, @x[5], @t[5]
471 pshufd \$0x93, @x[6], @t[6]
473 pshufd \$0x93, @x[7], @t[7]
480 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pshufd \$0x4E, @x[1], @x[1]
488 pshufd \$0x4E, @x[4], @t[0]
490 pshufd \$0x4E, @x[5], @t[1]
492 pshufd \$0x4E, @x[3], @x[4]
494 pshufd \$0x4E, @x[7], @x[5]
496 pshufd \$0x4E, @x[6], @x[3]
498 pshufd \$0x4E, @x[2], @x[6]
515 # multiplication by 0x0e
516 pshufd \$0x93, @x[7], @t[7]
518 pxor @x[5], @x[7] # 7 5
519 pxor @x[5], @x[2] # 2 5
520 pshufd \$0x93, @x[0], @t[0]
522 pxor @x[0], @x[5] # 5 0 [1]
523 pxor @x[1], @x[0] # 0 1
524 pshufd \$0x93, @x[1], @t[1]
525 pxor @x[2], @x[1] # 1 25
526 pxor @x[6], @x[0] # 01 6 [2]
527 pxor @x[3], @x[1] # 125 3 [4]
528 pshufd \$0x93, @x[3], @t[3]
529 pxor @x[0], @x[2] # 25 016 [3]
530 pxor @x[7], @x[3] # 3 75
531 pxor @x[6], @x[7] # 75 6 [0]
532 pshufd \$0x93, @x[6], @t[6]
534 pxor @x[4], @x[6] # 6 4
535 pxor @x[3], @x[4] # 4 375 [6]
536 pxor @x[7], @x[3] # 375 756=36
537 pxor @t[5], @x[6] # 64 5 [7]
538 pxor @t[2], @x[3] # 36 2
539 pxor @t[4], @x[3] # 362 4 [5]
540 pshufd \$0x93, @t[5], @t[5]
542 my @y = @x[7,5,0,2,1,3,4,6];
544 # multiplication by 0x0b
548 pshufd \$0x93, @t[2], @t[2]
552 pshufd \$0x93, @t[4], @t[4]
553 pxor @t[6], @t[7] # clobber t[7]
557 pshufd \$0x93, @t[0], @t[0]
561 pshufd \$0x93, @t[1], @t[1]
565 pshufd \$0x93, @t[2], @t[2]
569 pshufd \$0x93, @t[3], @t[3]
575 pxor @t[5], @t[7] # clobber t[7] even more
578 pshufd \$0x93, @t[4], @t[4]
583 pshufd \$0x93, @t[5], @t[5]
584 pxor @t[6], @t[7] # restore t[7]
586 # multiplication by 0x0d
589 pshufd \$0x93, @t[6], @t[6]
593 pshufd \$0x93, @t[7], @t[7]
602 pshufd \$0x93, @t[0], @t[0]
606 pshufd \$0x93, @t[1], @t[1]
611 pshufd \$0x93, @t[2], @t[2]
613 pxor @t[3], @t[6] # clobber t[6]
620 pshufd \$0x93, @t[4], @t[4]
623 pxor @t[3], @t[6] # restore t[6]
625 pshufd \$0x93, @t[5], @t[5]
626 pshufd \$0x93, @t[6], @t[6]
627 pshufd \$0x93, @t[7], @t[7]
628 pshufd \$0x93, @t[3], @t[3]
630 # multiplication by 0x09
632 pxor @y[1], @t[1] # t[1]=y[1]
633 pxor @t[5], @t[0] # clobber t[0]
636 pxor @y[0], @t[0] # t[0]=y[0]
638 pxor @t[7], @t[6] # clobber t[6]
641 pxor @y[4], @t[4] # t[4]=y[4]
643 pxor @y[3], @t[3] # t[3]=y[3]
645 pxor @y[2], @t[2] # t[2]=y[2]
647 pxor @y[5], @t[5] # t[5]=y[5]
650 pxor @y[6], @t[6] # t[6]=y[6]
651 pxor @y[7], @t[7] # t[7]=y[7]
664 sub aesenc { # not used
668 movdqa 0x30($const),@t[0] # .LSR
670 &ShiftRows (@b,@t[0]);
672 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
675 sub aesenclast { # not used
679 movdqa 0x40($const),@t[0] # .LSRM0
681 &ShiftRows (@b,@t[0]);
684 pxor 0x00($key),@b[0]
685 pxor 0x10($key),@b[1]
686 pxor 0x20($key),@b[4]
687 pxor 0x30($key),@b[6]
688 pxor 0x40($key),@b[3]
689 pxor 0x50($key),@b[7]
690 pxor 0x60($key),@b[2]
691 pxor 0x70($key),@b[5]
696 my ($a,$b,$n,$mask,$t)=@_;
708 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
728 my @x=reverse(@_[0..7]);
729 my ($t0,$t1,$t2,$t3)=@_[8..11];
731 movdqa 0x00($const),$t0 # .LBS0
732 movdqa 0x10($const),$t1 # .LBS1
734 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
735 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
737 movdqa 0x20($const),$t0 # .LBS2
739 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
740 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
742 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
743 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
749 .extern asm_AES_encrypt
750 .extern asm_AES_decrypt
752 .type _bsaes_encrypt8,\@abi-omnipotent
755 lea .LBS0(%rip), $const # constants table
757 movdqa ($key), @XMM[9] # round 0 key
759 movdqa 0x50($const), @XMM[8] # .LM0SR
760 pxor @XMM[9], @XMM[0] # xor with round0 key
761 pxor @XMM[9], @XMM[1]
762 pshufb @XMM[8], @XMM[0]
763 pxor @XMM[9], @XMM[2]
764 pshufb @XMM[8], @XMM[1]
765 pxor @XMM[9], @XMM[3]
766 pshufb @XMM[8], @XMM[2]
767 pxor @XMM[9], @XMM[4]
768 pshufb @XMM[8], @XMM[3]
769 pxor @XMM[9], @XMM[5]
770 pshufb @XMM[8], @XMM[4]
771 pxor @XMM[9], @XMM[6]
772 pshufb @XMM[8], @XMM[5]
773 pxor @XMM[9], @XMM[7]
774 pshufb @XMM[8], @XMM[6]
775 pshufb @XMM[8], @XMM[7]
776 _bsaes_encrypt8_bitslice:
778 &bitslice (@XMM[0..7, 8..11]);
785 &ShiftRows (@XMM[0..7, 8]);
786 $code.=".Lenc_sbox:\n";
787 &Sbox (@XMM[0..7, 8..15]);
792 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
794 movdqa 0x30($const), @XMM[8] # .LSR
796 movdqa 0x40($const), @XMM[8] # .LSRM0
801 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
802 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
804 movdqa ($key), @XMM[8] # last round key
805 pxor @XMM[8], @XMM[4]
806 pxor @XMM[8], @XMM[6]
807 pxor @XMM[8], @XMM[3]
808 pxor @XMM[8], @XMM[7]
809 pxor @XMM[8], @XMM[2]
810 pxor @XMM[8], @XMM[5]
811 pxor @XMM[8], @XMM[0]
812 pxor @XMM[8], @XMM[1]
814 .size _bsaes_encrypt8,.-_bsaes_encrypt8
816 .type _bsaes_decrypt8,\@abi-omnipotent
819 lea .LBS0(%rip), $const # constants table
821 movdqa ($key), @XMM[9] # round 0 key
823 movdqa -0x30($const), @XMM[8] # .LM0ISR
824 pxor @XMM[9], @XMM[0] # xor with round0 key
825 pxor @XMM[9], @XMM[1]
826 pshufb @XMM[8], @XMM[0]
827 pxor @XMM[9], @XMM[2]
828 pshufb @XMM[8], @XMM[1]
829 pxor @XMM[9], @XMM[3]
830 pshufb @XMM[8], @XMM[2]
831 pxor @XMM[9], @XMM[4]
832 pshufb @XMM[8], @XMM[3]
833 pxor @XMM[9], @XMM[5]
834 pshufb @XMM[8], @XMM[4]
835 pxor @XMM[9], @XMM[6]
836 pshufb @XMM[8], @XMM[5]
837 pxor @XMM[9], @XMM[7]
838 pshufb @XMM[8], @XMM[6]
839 pshufb @XMM[8], @XMM[7]
841 &bitslice (@XMM[0..7, 8..11]);
848 &ShiftRows (@XMM[0..7, 8]);
849 $code.=".Ldec_sbox:\n";
850 &InvSbox (@XMM[0..7, 8..15]);
855 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
857 movdqa -0x10($const), @XMM[8] # .LISR
859 movdqa -0x20($const), @XMM[8] # .LISRM0
864 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[6]
868 pxor @XMM[8], @XMM[4]
869 pxor @XMM[8], @XMM[2]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[3]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
876 .size _bsaes_decrypt8,.-_bsaes_decrypt8
880 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
883 my @x=reverse(@_[0..7]);
884 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
886 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
888 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
892 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
894 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
896 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
902 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
903 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
907 .type _bsaes_key_convert,\@abi-omnipotent
910 lea .Lmasks(%rip), $const
911 movdqu ($inp), %xmm7 # load round 0 key
913 movdqa 0x00($const), %xmm0 # 0x01...
914 movdqa 0x10($const), %xmm1 # 0x02...
915 movdqa 0x20($const), %xmm2 # 0x04...
916 movdqa 0x30($const), %xmm3 # 0x08...
917 movdqa 0x40($const), %xmm4 # .LM0
918 pcmpeqd %xmm5, %xmm5 # .LNOT
920 movdqu ($inp), %xmm6 # load round 1 key
921 movdqa %xmm7, ($out) # save round 0 key
927 pshufb %xmm4, %xmm6 # .LM0
936 psllq \$4, %xmm0 # 0x10...
939 psllq \$4, %xmm1 # 0x20...
944 pcmpeqb %xmm2, %xmm10
945 psllq \$4, %xmm2 # 0x40...
947 pcmpeqb %xmm3, %xmm11
948 psllq \$4, %xmm3 # 0x80...
952 pxor %xmm5, %xmm8 # "pnot"
957 movdqa %xmm8, 0x00($out) # write bit-sliced round key
958 pcmpeqb %xmm0, %xmm12
959 psrlq \$4, %xmm0 # 0x01...
960 movdqa %xmm9, 0x10($out)
961 pcmpeqb %xmm1, %xmm13
962 psrlq \$4, %xmm1 # 0x02...
967 movdqa %xmm10, 0x20($out)
968 pcmpeqb %xmm2, %xmm14
969 psrlq \$4, %xmm2 # 0x04...
970 movdqa %xmm11, 0x30($out)
971 pcmpeqb %xmm3, %xmm15
972 psrlq \$4, %xmm3 # 0x08...
973 movdqu ($inp), %xmm6 # load next round key
975 pxor %xmm5, %xmm13 # "pnot"
977 movdqa %xmm12, 0x40($out)
978 movdqa %xmm13, 0x50($out)
979 movdqa %xmm14, 0x60($out)
980 movdqa %xmm15, 0x70($out)
985 movdqa 0x50($const), %xmm7 # .L63
986 #movdqa %xmm6, ($out) # don't save last round key
988 .size _bsaes_key_convert,.-_bsaes_key_convert
992 if (0 && !$win64) { # following four functions are unsupported interface
993 # used for benchmarking...
995 .globl bsaes_enc_key_convert
996 .type bsaes_enc_key_convert,\@function,2
998 bsaes_enc_key_convert:
999 mov 240($inp),%r10d # pass rounds
1000 mov $inp,%rcx # pass key
1001 mov $out,%rax # pass key schedule
1002 call _bsaes_key_convert
1003 pxor %xmm6,%xmm7 # fix up last round key
1004 movdqa %xmm7,(%rax) # save last round key
1006 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1008 .globl bsaes_encrypt_128
1009 .type bsaes_encrypt_128,\@function,4
1013 movdqu 0x00($inp), @XMM[0] # load input
1014 movdqu 0x10($inp), @XMM[1]
1015 movdqu 0x20($inp), @XMM[2]
1016 movdqu 0x30($inp), @XMM[3]
1017 movdqu 0x40($inp), @XMM[4]
1018 movdqu 0x50($inp), @XMM[5]
1019 movdqu 0x60($inp), @XMM[6]
1020 movdqu 0x70($inp), @XMM[7]
1021 mov $key, %rax # pass the $key
1022 lea 0x80($inp), $inp
1025 call _bsaes_encrypt8
1027 movdqu @XMM[0], 0x00($out) # write output
1028 movdqu @XMM[1], 0x10($out)
1029 movdqu @XMM[4], 0x20($out)
1030 movdqu @XMM[6], 0x30($out)
1031 movdqu @XMM[3], 0x40($out)
1032 movdqu @XMM[7], 0x50($out)
1033 movdqu @XMM[2], 0x60($out)
1034 movdqu @XMM[5], 0x70($out)
1035 lea 0x80($out), $out
1039 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1041 .globl bsaes_dec_key_convert
1042 .type bsaes_dec_key_convert,\@function,2
1044 bsaes_dec_key_convert:
1045 mov 240($inp),%r10d # pass rounds
1046 mov $inp,%rcx # pass key
1047 mov $out,%rax # pass key schedule
1048 call _bsaes_key_convert
1049 pxor ($out),%xmm7 # fix up round 0 key
1050 movdqa %xmm6,(%rax) # save last round key
1053 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1055 .globl bsaes_decrypt_128
1056 .type bsaes_decrypt_128,\@function,4
1060 movdqu 0x00($inp), @XMM[0] # load input
1061 movdqu 0x10($inp), @XMM[1]
1062 movdqu 0x20($inp), @XMM[2]
1063 movdqu 0x30($inp), @XMM[3]
1064 movdqu 0x40($inp), @XMM[4]
1065 movdqu 0x50($inp), @XMM[5]
1066 movdqu 0x60($inp), @XMM[6]
1067 movdqu 0x70($inp), @XMM[7]
1068 mov $key, %rax # pass the $key
1069 lea 0x80($inp), $inp
1072 call _bsaes_decrypt8
1074 movdqu @XMM[0], 0x00($out) # write output
1075 movdqu @XMM[1], 0x10($out)
1076 movdqu @XMM[6], 0x20($out)
1077 movdqu @XMM[4], 0x30($out)
1078 movdqu @XMM[2], 0x40($out)
1079 movdqu @XMM[7], 0x50($out)
1080 movdqu @XMM[3], 0x60($out)
1081 movdqu @XMM[5], 0x70($out)
1082 lea 0x80($out), $out
1086 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1090 ######################################################################
1094 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1095 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1096 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1100 .globl bsaes_ecb_encrypt_blocks
1101 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1103 bsaes_ecb_encrypt_blocks:
1112 lea -0x48(%rsp),%rsp
1114 $code.=<<___ if ($win64);
1115 lea -0xa0(%rsp), %rsp
1116 movaps %xmm6, 0x40(%rsp)
1117 movaps %xmm7, 0x50(%rsp)
1118 movaps %xmm8, 0x60(%rsp)
1119 movaps %xmm9, 0x70(%rsp)
1120 movaps %xmm10, 0x80(%rsp)
1121 movaps %xmm11, 0x90(%rsp)
1122 movaps %xmm12, 0xa0(%rsp)
1123 movaps %xmm13, 0xb0(%rsp)
1124 movaps %xmm14, 0xc0(%rsp)
1125 movaps %xmm15, 0xd0(%rsp)
1129 mov %rsp,%rbp # backup %rsp
1130 mov 240($arg4),%eax # rounds
1131 mov $arg1,$inp # backup arguments
1138 mov %eax,%ebx # backup rounds
1139 shl \$7,%rax # 128 bytes per inner round key
1140 sub \$`128-32`,%rax # size of bit-sliced key schedule
1142 mov %rsp,%rax # pass key schedule
1143 mov $key,%rcx # pass key
1144 mov %ebx,%r10d # pass rounds
1145 call _bsaes_key_convert
1146 pxor %xmm6,%xmm7 # fix up last round key
1147 movdqa %xmm7,(%rax) # save last round key
1151 movdqu 0x00($inp), @XMM[0] # load input
1152 movdqu 0x10($inp), @XMM[1]
1153 movdqu 0x20($inp), @XMM[2]
1154 movdqu 0x30($inp), @XMM[3]
1155 movdqu 0x40($inp), @XMM[4]
1156 movdqu 0x50($inp), @XMM[5]
1157 mov %rsp, %rax # pass key schedule
1158 movdqu 0x60($inp), @XMM[6]
1159 mov %ebx,%r10d # pass rounds
1160 movdqu 0x70($inp), @XMM[7]
1161 lea 0x80($inp), $inp
1163 call _bsaes_encrypt8
1165 movdqu @XMM[0], 0x00($out) # write output
1166 movdqu @XMM[1], 0x10($out)
1167 movdqu @XMM[4], 0x20($out)
1168 movdqu @XMM[6], 0x30($out)
1169 movdqu @XMM[3], 0x40($out)
1170 movdqu @XMM[7], 0x50($out)
1171 movdqu @XMM[2], 0x60($out)
1172 movdqu @XMM[5], 0x70($out)
1173 lea 0x80($out), $out
1180 movdqu 0x00($inp), @XMM[0] # load input
1181 mov %rsp, %rax # pass key schedule
1182 mov %ebx,%r10d # pass rounds
1185 movdqu 0x10($inp), @XMM[1]
1187 movdqu 0x20($inp), @XMM[2]
1190 movdqu 0x30($inp), @XMM[3]
1192 movdqu 0x40($inp), @XMM[4]
1195 movdqu 0x50($inp), @XMM[5]
1197 movdqu 0x60($inp), @XMM[6]
1198 call _bsaes_encrypt8
1199 movdqu @XMM[0], 0x00($out) # write output
1200 movdqu @XMM[1], 0x10($out)
1201 movdqu @XMM[4], 0x20($out)
1202 movdqu @XMM[6], 0x30($out)
1203 movdqu @XMM[3], 0x40($out)
1204 movdqu @XMM[7], 0x50($out)
1205 movdqu @XMM[2], 0x60($out)
1209 call _bsaes_encrypt8
1210 movdqu @XMM[0], 0x00($out) # write output
1211 movdqu @XMM[1], 0x10($out)
1212 movdqu @XMM[4], 0x20($out)
1213 movdqu @XMM[6], 0x30($out)
1214 movdqu @XMM[3], 0x40($out)
1215 movdqu @XMM[7], 0x50($out)
1219 call _bsaes_encrypt8
1220 movdqu @XMM[0], 0x00($out) # write output
1221 movdqu @XMM[1], 0x10($out)
1222 movdqu @XMM[4], 0x20($out)
1223 movdqu @XMM[6], 0x30($out)
1224 movdqu @XMM[3], 0x40($out)
1228 call _bsaes_encrypt8
1229 movdqu @XMM[0], 0x00($out) # write output
1230 movdqu @XMM[1], 0x10($out)
1231 movdqu @XMM[4], 0x20($out)
1232 movdqu @XMM[6], 0x30($out)
1236 call _bsaes_encrypt8
1237 movdqu @XMM[0], 0x00($out) # write output
1238 movdqu @XMM[1], 0x10($out)
1239 movdqu @XMM[4], 0x20($out)
1243 call _bsaes_encrypt8
1244 movdqu @XMM[0], 0x00($out) # write output
1245 movdqu @XMM[1], 0x10($out)
1249 call _bsaes_encrypt8
1250 movdqu @XMM[0], 0x00($out) # write output
1257 call asm_AES_encrypt
1266 .Lecb_enc_bzero: # wipe key schedule [if any]
1267 movdqa %xmm0, 0x00(%rax)
1268 movdqa %xmm0, 0x10(%rax)
1269 lea 0x20(%rax), %rax
1273 lea (%rbp),%rsp # restore %rsp
1275 $code.=<<___ if ($win64);
1276 movaps 0x40(%rbp), %xmm6
1277 movaps 0x50(%rbp), %xmm7
1278 movaps 0x60(%rbp), %xmm8
1279 movaps 0x70(%rbp), %xmm9
1280 movaps 0x80(%rbp), %xmm10
1281 movaps 0x90(%rbp), %xmm11
1282 movaps 0xa0(%rbp), %xmm12
1283 movaps 0xb0(%rbp), %xmm13
1284 movaps 0xc0(%rbp), %xmm14
1285 movaps 0xd0(%rbp), %xmm15
1286 lea 0xa0(%rbp), %rsp
1289 mov 0x48(%rsp), %r15
1290 mov 0x50(%rsp), %r14
1291 mov 0x58(%rsp), %r13
1292 mov 0x60(%rsp), %r12
1293 mov 0x68(%rsp), %rbx
1294 mov 0x70(%rsp), %rax
1295 lea 0x78(%rsp), %rsp
1299 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1301 .globl bsaes_ecb_decrypt_blocks
1302 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1304 bsaes_ecb_decrypt_blocks:
1313 lea -0x48(%rsp),%rsp
1315 $code.=<<___ if ($win64);
1316 lea -0xa0(%rsp), %rsp
1317 movaps %xmm6, 0x40(%rsp)
1318 movaps %xmm7, 0x50(%rsp)
1319 movaps %xmm8, 0x60(%rsp)
1320 movaps %xmm9, 0x70(%rsp)
1321 movaps %xmm10, 0x80(%rsp)
1322 movaps %xmm11, 0x90(%rsp)
1323 movaps %xmm12, 0xa0(%rsp)
1324 movaps %xmm13, 0xb0(%rsp)
1325 movaps %xmm14, 0xc0(%rsp)
1326 movaps %xmm15, 0xd0(%rsp)
1330 mov %rsp,%rbp # backup %rsp
1331 mov 240($arg4),%eax # rounds
1332 mov $arg1,$inp # backup arguments
1339 mov %eax,%ebx # backup rounds
1340 shl \$7,%rax # 128 bytes per inner round key
1341 sub \$`128-32`,%rax # size of bit-sliced key schedule
1343 mov %rsp,%rax # pass key schedule
1344 mov $key,%rcx # pass key
1345 mov %ebx,%r10d # pass rounds
1346 call _bsaes_key_convert
1347 pxor (%rsp),%xmm7 # fix up 0 round key
1348 movdqa %xmm6,(%rax) # save last round key
1353 movdqu 0x00($inp), @XMM[0] # load input
1354 movdqu 0x10($inp), @XMM[1]
1355 movdqu 0x20($inp), @XMM[2]
1356 movdqu 0x30($inp), @XMM[3]
1357 movdqu 0x40($inp), @XMM[4]
1358 movdqu 0x50($inp), @XMM[5]
1359 mov %rsp, %rax # pass key schedule
1360 movdqu 0x60($inp), @XMM[6]
1361 mov %ebx,%r10d # pass rounds
1362 movdqu 0x70($inp), @XMM[7]
1363 lea 0x80($inp), $inp
1365 call _bsaes_decrypt8
1367 movdqu @XMM[0], 0x00($out) # write output
1368 movdqu @XMM[1], 0x10($out)
1369 movdqu @XMM[6], 0x20($out)
1370 movdqu @XMM[4], 0x30($out)
1371 movdqu @XMM[2], 0x40($out)
1372 movdqu @XMM[7], 0x50($out)
1373 movdqu @XMM[3], 0x60($out)
1374 movdqu @XMM[5], 0x70($out)
1375 lea 0x80($out), $out
1382 movdqu 0x00($inp), @XMM[0] # load input
1383 mov %rsp, %rax # pass key schedule
1384 mov %ebx,%r10d # pass rounds
1387 movdqu 0x10($inp), @XMM[1]
1389 movdqu 0x20($inp), @XMM[2]
1392 movdqu 0x30($inp), @XMM[3]
1394 movdqu 0x40($inp), @XMM[4]
1397 movdqu 0x50($inp), @XMM[5]
1399 movdqu 0x60($inp), @XMM[6]
1400 call _bsaes_decrypt8
1401 movdqu @XMM[0], 0x00($out) # write output
1402 movdqu @XMM[1], 0x10($out)
1403 movdqu @XMM[6], 0x20($out)
1404 movdqu @XMM[4], 0x30($out)
1405 movdqu @XMM[2], 0x40($out)
1406 movdqu @XMM[7], 0x50($out)
1407 movdqu @XMM[3], 0x60($out)
1411 call _bsaes_decrypt8
1412 movdqu @XMM[0], 0x00($out) # write output
1413 movdqu @XMM[1], 0x10($out)
1414 movdqu @XMM[6], 0x20($out)
1415 movdqu @XMM[4], 0x30($out)
1416 movdqu @XMM[2], 0x40($out)
1417 movdqu @XMM[7], 0x50($out)
1421 call _bsaes_decrypt8
1422 movdqu @XMM[0], 0x00($out) # write output
1423 movdqu @XMM[1], 0x10($out)
1424 movdqu @XMM[6], 0x20($out)
1425 movdqu @XMM[4], 0x30($out)
1426 movdqu @XMM[2], 0x40($out)
1430 call _bsaes_decrypt8
1431 movdqu @XMM[0], 0x00($out) # write output
1432 movdqu @XMM[1], 0x10($out)
1433 movdqu @XMM[6], 0x20($out)
1434 movdqu @XMM[4], 0x30($out)
1438 call _bsaes_decrypt8
1439 movdqu @XMM[0], 0x00($out) # write output
1440 movdqu @XMM[1], 0x10($out)
1441 movdqu @XMM[6], 0x20($out)
1445 call _bsaes_decrypt8
1446 movdqu @XMM[0], 0x00($out) # write output
1447 movdqu @XMM[1], 0x10($out)
1451 call _bsaes_decrypt8
1452 movdqu @XMM[0], 0x00($out) # write output
1459 call asm_AES_decrypt
1468 .Lecb_dec_bzero: # wipe key schedule [if any]
1469 movdqa %xmm0, 0x00(%rax)
1470 movdqa %xmm0, 0x10(%rax)
1471 lea 0x20(%rax), %rax
1475 lea (%rbp),%rsp # restore %rsp
1477 $code.=<<___ if ($win64);
1478 movaps 0x40(%rbp), %xmm6
1479 movaps 0x50(%rbp), %xmm7
1480 movaps 0x60(%rbp), %xmm8
1481 movaps 0x70(%rbp), %xmm9
1482 movaps 0x80(%rbp), %xmm10
1483 movaps 0x90(%rbp), %xmm11
1484 movaps 0xa0(%rbp), %xmm12
1485 movaps 0xb0(%rbp), %xmm13
1486 movaps 0xc0(%rbp), %xmm14
1487 movaps 0xd0(%rbp), %xmm15
1488 lea 0xa0(%rbp), %rsp
1491 mov 0x48(%rsp), %r15
1492 mov 0x50(%rsp), %r14
1493 mov 0x58(%rsp), %r13
1494 mov 0x60(%rsp), %r12
1495 mov 0x68(%rsp), %rbx
1496 mov 0x70(%rsp), %rax
1497 lea 0x78(%rsp), %rsp
1501 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1505 .extern asm_AES_cbc_encrypt
1506 .globl bsaes_cbc_encrypt
1507 .type bsaes_cbc_encrypt,\@abi-omnipotent
1511 $code.=<<___ if ($win64);
1512 mov 48(%rsp),$arg6 # pull direction flag
1516 jne asm_AES_cbc_encrypt
1518 jb asm_AES_cbc_encrypt
1528 lea -0x48(%rsp), %rsp
1530 $code.=<<___ if ($win64);
1531 mov 0xa0(%rsp),$arg5 # pull ivp
1532 lea -0xa0(%rsp), %rsp
1533 movaps %xmm6, 0x40(%rsp)
1534 movaps %xmm7, 0x50(%rsp)
1535 movaps %xmm8, 0x60(%rsp)
1536 movaps %xmm9, 0x70(%rsp)
1537 movaps %xmm10, 0x80(%rsp)
1538 movaps %xmm11, 0x90(%rsp)
1539 movaps %xmm12, 0xa0(%rsp)
1540 movaps %xmm13, 0xb0(%rsp)
1541 movaps %xmm14, 0xc0(%rsp)
1542 movaps %xmm15, 0xd0(%rsp)
1546 mov %rsp, %rbp # backup %rsp
1547 mov 240($arg4), %eax # rounds
1548 mov $arg1, $inp # backup arguments
1553 shr \$4, $len # bytes to blocks
1555 mov %eax, %edx # rounds
1556 shl \$7, %rax # 128 bytes per inner round key
1557 sub \$`128-32`, %rax # size of bit-sliced key schedule
1560 mov %rsp, %rax # pass key schedule
1561 mov $key, %rcx # pass key
1562 mov %edx, %r10d # pass rounds
1563 call _bsaes_key_convert
1564 pxor (%rsp),%xmm7 # fix up 0 round key
1565 movdqa %xmm6,(%rax) # save last round key
1568 movdqu (%rbx), @XMM[15] # load IV
1571 movdqu 0x00($inp), @XMM[0] # load input
1572 movdqu 0x10($inp), @XMM[1]
1573 movdqu 0x20($inp), @XMM[2]
1574 movdqu 0x30($inp), @XMM[3]
1575 movdqu 0x40($inp), @XMM[4]
1576 movdqu 0x50($inp), @XMM[5]
1577 mov %rsp, %rax # pass key schedule
1578 movdqu 0x60($inp), @XMM[6]
1579 mov %edx,%r10d # pass rounds
1580 movdqu 0x70($inp), @XMM[7]
1581 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1583 call _bsaes_decrypt8
1585 pxor 0x20(%rbp), @XMM[0] # ^= IV
1586 movdqu 0x00($inp), @XMM[8] # re-load input
1587 movdqu 0x10($inp), @XMM[9]
1588 pxor @XMM[8], @XMM[1]
1589 movdqu 0x20($inp), @XMM[10]
1590 pxor @XMM[9], @XMM[6]
1591 movdqu 0x30($inp), @XMM[11]
1592 pxor @XMM[10], @XMM[4]
1593 movdqu 0x40($inp), @XMM[12]
1594 pxor @XMM[11], @XMM[2]
1595 movdqu 0x50($inp), @XMM[13]
1596 pxor @XMM[12], @XMM[7]
1597 movdqu 0x60($inp), @XMM[14]
1598 pxor @XMM[13], @XMM[3]
1599 movdqu 0x70($inp), @XMM[15] # IV
1600 pxor @XMM[14], @XMM[5]
1601 movdqu @XMM[0], 0x00($out) # write output
1602 lea 0x80($inp), $inp
1603 movdqu @XMM[1], 0x10($out)
1604 movdqu @XMM[6], 0x20($out)
1605 movdqu @XMM[4], 0x30($out)
1606 movdqu @XMM[2], 0x40($out)
1607 movdqu @XMM[7], 0x50($out)
1608 movdqu @XMM[3], 0x60($out)
1609 movdqu @XMM[5], 0x70($out)
1610 lea 0x80($out), $out
1617 movdqu 0x00($inp), @XMM[0] # load input
1618 mov %rsp, %rax # pass key schedule
1619 mov %edx, %r10d # pass rounds
1622 movdqu 0x10($inp), @XMM[1]
1624 movdqu 0x20($inp), @XMM[2]
1627 movdqu 0x30($inp), @XMM[3]
1629 movdqu 0x40($inp), @XMM[4]
1632 movdqu 0x50($inp), @XMM[5]
1634 movdqu 0x60($inp), @XMM[6]
1635 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1636 call _bsaes_decrypt8
1637 pxor 0x20(%rbp), @XMM[0] # ^= IV
1638 movdqu 0x00($inp), @XMM[8] # re-load input
1639 movdqu 0x10($inp), @XMM[9]
1640 pxor @XMM[8], @XMM[1]
1641 movdqu 0x20($inp), @XMM[10]
1642 pxor @XMM[9], @XMM[6]
1643 movdqu 0x30($inp), @XMM[11]
1644 pxor @XMM[10], @XMM[4]
1645 movdqu 0x40($inp), @XMM[12]
1646 pxor @XMM[11], @XMM[2]
1647 movdqu 0x50($inp), @XMM[13]
1648 pxor @XMM[12], @XMM[7]
1649 movdqu 0x60($inp), @XMM[15] # IV
1650 pxor @XMM[13], @XMM[3]
1651 movdqu @XMM[0], 0x00($out) # write output
1652 movdqu @XMM[1], 0x10($out)
1653 movdqu @XMM[6], 0x20($out)
1654 movdqu @XMM[4], 0x30($out)
1655 movdqu @XMM[2], 0x40($out)
1656 movdqu @XMM[7], 0x50($out)
1657 movdqu @XMM[3], 0x60($out)
1661 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1662 call _bsaes_decrypt8
1663 pxor 0x20(%rbp), @XMM[0] # ^= IV
1664 movdqu 0x00($inp), @XMM[8] # re-load input
1665 movdqu 0x10($inp), @XMM[9]
1666 pxor @XMM[8], @XMM[1]
1667 movdqu 0x20($inp), @XMM[10]
1668 pxor @XMM[9], @XMM[6]
1669 movdqu 0x30($inp), @XMM[11]
1670 pxor @XMM[10], @XMM[4]
1671 movdqu 0x40($inp), @XMM[12]
1672 pxor @XMM[11], @XMM[2]
1673 movdqu 0x50($inp), @XMM[15] # IV
1674 pxor @XMM[12], @XMM[7]
1675 movdqu @XMM[0], 0x00($out) # write output
1676 movdqu @XMM[1], 0x10($out)
1677 movdqu @XMM[6], 0x20($out)
1678 movdqu @XMM[4], 0x30($out)
1679 movdqu @XMM[2], 0x40($out)
1680 movdqu @XMM[7], 0x50($out)
1684 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1685 call _bsaes_decrypt8
1686 pxor 0x20(%rbp), @XMM[0] # ^= IV
1687 movdqu 0x00($inp), @XMM[8] # re-load input
1688 movdqu 0x10($inp), @XMM[9]
1689 pxor @XMM[8], @XMM[1]
1690 movdqu 0x20($inp), @XMM[10]
1691 pxor @XMM[9], @XMM[6]
# NOTE(review): tail of the CBC-decrypt path of bsaes_cbc_encrypt (entry and
# the 8/7/6-block cases are above this excerpt).  Each section below handles a
# fixed residual block count: decrypt N blocks with _bsaes_decrypt8, then
# re-load the ciphertext blocks and xor them in as the CBC chaining values.
# The last ciphertext block of each batch is kept as the next IV.
1692 movdqu 0x30($inp), @XMM[11]
1693 pxor @XMM[10], @XMM[4]
1694 movdqu 0x40($inp), @XMM[15] # IV
1695 pxor @XMM[11], @XMM[2]
1696 movdqu @XMM[0], 0x00($out) # write output
1697 movdqu @XMM[1], 0x10($out)
1698 movdqu @XMM[6], 0x20($out)
1699 movdqu @XMM[4], 0x30($out)
1700 movdqu @XMM[2], 0x40($out)
# 4-block case: IV parked in the stack frame at 0x20(%rbp) across the call.
1704 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1705 call _bsaes_decrypt8
1706 pxor 0x20(%rbp), @XMM[0] # ^= IV
1707 movdqu 0x00($inp), @XMM[8] # re-load input
1708 movdqu 0x10($inp), @XMM[9]
1709 pxor @XMM[8], @XMM[1]
1710 movdqu 0x20($inp), @XMM[10]
1711 pxor @XMM[9], @XMM[6]
1712 movdqu 0x30($inp), @XMM[15] # IV
1713 pxor @XMM[10], @XMM[4]
1714 movdqu @XMM[0], 0x00($out) # write output
1715 movdqu @XMM[1], 0x10($out)
1716 movdqu @XMM[6], 0x20($out)
1717 movdqu @XMM[4], 0x30($out)
# 3-block case.
1721 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1722 call _bsaes_decrypt8
1723 pxor 0x20(%rbp), @XMM[0] # ^= IV
1724 movdqu 0x00($inp), @XMM[8] # re-load input
1725 movdqu 0x10($inp), @XMM[9]
1726 pxor @XMM[8], @XMM[1]
1727 movdqu 0x20($inp), @XMM[15] # IV
1728 pxor @XMM[9], @XMM[6]
1729 movdqu @XMM[0], 0x00($out) # write output
1730 movdqu @XMM[1], 0x10($out)
1731 movdqu @XMM[6], 0x20($out)
# 2-block case.
1735 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1736 call _bsaes_decrypt8
1737 pxor 0x20(%rbp), @XMM[0] # ^= IV
1738 movdqu 0x00($inp), @XMM[8] # re-load input
1739 movdqu 0x10($inp), @XMM[15] # IV
1740 pxor @XMM[8], @XMM[1]
1741 movdqu @XMM[0], 0x00($out) # write output
1742 movdqu @XMM[1], 0x10($out)
# Single remaining block: fall back to the scalar AES routine, which does not
# clobber %xmm registers, then xor with the saved IV.
1747 lea 0x20(%rbp), $arg2 # buffer output
1749 call asm_AES_decrypt # doesn't touch %xmm
1750 pxor 0x20(%rbp), @XMM[15] # ^= IV
1751 movdqu @XMM[15], ($out) # write output
1752 movdqa @XMM[0], @XMM[15] # IV
1755 movdqu @XMM[15], (%rbx) # return IV
# Scrub the on-stack bit-sliced key schedule (%xmm0 is presumably zero here —
# zeroing happens in lines omitted from this excerpt; confirm in full file).
1758 .Lcbc_dec_bzero: # wipe key schedule [if any]
1759 movdqa %xmm0, 0x00(%rax)
1760 movdqa %xmm0, 0x10(%rax)
1761 lea 0x20(%rax), %rax
1765 lea (%rbp),%rsp # restore %rsp
# Win64 ABI: restore the non-volatile xmm6-xmm15 saved in the prologue.
1767 $code.=<<___ if ($win64);
1768 movaps 0x40(%rbp), %xmm6
1769 movaps 0x50(%rbp), %xmm7
1770 movaps 0x60(%rbp), %xmm8
1771 movaps 0x70(%rbp), %xmm9
1772 movaps 0x80(%rbp), %xmm10
1773 movaps 0x90(%rbp), %xmm11
1774 movaps 0xa0(%rbp), %xmm12
1775 movaps 0xb0(%rbp), %xmm13
1776 movaps 0xc0(%rbp), %xmm14
1777 movaps 0xd0(%rbp), %xmm15
1778 lea 0xa0(%rbp), %rsp
# Restore callee-saved general-purpose registers pushed in the prologue.
1781 mov 0x48(%rsp), %r15
1782 mov 0x50(%rsp), %r14
1783 mov 0x58(%rsp), %r13
1784 mov 0x60(%rsp), %r12
1785 mov 0x68(%rsp), %rbx
1786 mov 0x70(%rsp), %rax
1787 lea 0x78(%rsp), %rsp
1791 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# bsaes_ctr32_encrypt_blocks: CTR mode with a 32-bit big-endian counter in
# the last dword of the IV.  Strategy: keep the counter byte-swapped to
# little-endian so it can be incremented with paddd (.LADD1..8), and swap it
# back via pshufb as part of the borrowed _bsaes_encrypt8 input transform.
1793 .globl bsaes_ctr32_encrypt_blocks
1794 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1796 bsaes_ctr32_encrypt_blocks:
1805 lea -0x48(%rsp), %rsp
# Win64: 5th argument comes from the stack; save non-volatile xmm6-15.
1807 $code.=<<___ if ($win64);
1808 mov 0xa0(%rsp),$arg5 # pull ivp
1809 lea -0xa0(%rsp), %rsp
1810 movaps %xmm6, 0x40(%rsp)
1811 movaps %xmm7, 0x50(%rsp)
1812 movaps %xmm8, 0x60(%rsp)
1813 movaps %xmm9, 0x70(%rsp)
1814 movaps %xmm10, 0x80(%rsp)
1815 movaps %xmm11, 0x90(%rsp)
1816 movaps %xmm12, 0xa0(%rsp)
1817 movaps %xmm13, 0xb0(%rsp)
1818 movaps %xmm14, 0xc0(%rsp)
1819 movaps %xmm15, 0xd0(%rsp)
1823 mov %rsp, %rbp # backup %rsp
1824 movdqu ($arg5), %xmm0 # load counter
1825 mov 240($arg4), %eax # rounds
1826 mov $arg1, $inp # backup arguments
1830 movdqa %xmm0, 0x20(%rbp) # copy counter
# Allocate the on-stack bit-sliced key schedule: 128 bytes per round key.
1834 mov %eax, %ebx # rounds
1835 shl \$7, %rax # 128 bytes per inner round key
1836 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert the conventional AES key schedule to bit-sliced form.
1839 mov %rsp, %rax # pass key schedule
1840 mov $key, %rcx # pass key
1841 mov %ebx, %r10d # pass rounds
1842 call _bsaes_key_convert
1843 pxor %xmm6,%xmm7 # fix up last round key
1844 movdqa %xmm7,(%rax) # save last round key
# Pre-byteswap both round-0 key and counter (.LSWPUP) so additions below
# operate on a little-endian counter.
1846 movdqa (%rsp), @XMM[9] # load round0 key
1847 lea .LADD1(%rip), %r11
1848 movdqa 0x20(%rbp), @XMM[0] # counter copy
1849 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1850 pshufb @XMM[8], @XMM[9] # byte swap upper part
1851 pshufb @XMM[8], @XMM[0]
1852 movdqa @XMM[9], (%rsp) # save adjusted round0 key
# Main loop: derive counter+1 .. counter+7 for an 8-block batch.
1856 movdqa @XMM[0], 0x20(%rbp) # save counter
1857 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1858 movdqa @XMM[0], @XMM[2]
1859 paddd 0x00(%r11), @XMM[1] # .LADD1
1860 movdqa @XMM[0], @XMM[3]
1861 paddd 0x10(%r11), @XMM[2] # .LADD2
1862 movdqa @XMM[0], @XMM[4]
1863 paddd 0x20(%r11), @XMM[3] # .LADD3
1864 movdqa @XMM[0], @XMM[5]
1865 paddd 0x30(%r11), @XMM[4] # .LADD4
1866 movdqa @XMM[0], @XMM[6]
1867 paddd 0x40(%r11), @XMM[5] # .LADD5
1868 movdqa @XMM[0], @XMM[7]
1869 paddd 0x50(%r11), @XMM[6] # .LADD6
1870 paddd 0x60(%r11), @XMM[7] # .LADD7
1872 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1873 # to flip byte order in 32-bit counter
# .LSWPUPM0SR combines the counter byte-swap with _bsaes_encrypt8's expected
# input permutation, so we can jump into its bit-slice core directly.
1874 movdqa (%rsp), @XMM[9] # round 0 key
1875 lea 0x10(%rsp), %rax # pass key schedule
1876 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1877 pxor @XMM[9], @XMM[0] # xor with round0 key
1878 pxor @XMM[9], @XMM[1]
1879 pshufb @XMM[8], @XMM[0]
1880 pxor @XMM[9], @XMM[2]
1881 pshufb @XMM[8], @XMM[1]
1882 pxor @XMM[9], @XMM[3]
1883 pshufb @XMM[8], @XMM[2]
1884 pxor @XMM[9], @XMM[4]
1885 pshufb @XMM[8], @XMM[3]
1886 pxor @XMM[9], @XMM[5]
1887 pshufb @XMM[8], @XMM[4]
1888 pxor @XMM[9], @XMM[6]
1889 pshufb @XMM[8], @XMM[5]
1890 pxor @XMM[9], @XMM[7]
1891 pshufb @XMM[8], @XMM[6]
1892 lea .LBS0(%rip), %r11 # constants table
1893 pshufb @XMM[8], @XMM[7]
1894 mov %ebx,%r10d # pass rounds
1896 call _bsaes_encrypt8_bitslice
# Fewer than 8 blocks remain -> the partial-batch tail below.
1899 jc .Lctr_enc_loop_done
# Full 8-block batch: xor keystream with plaintext.  Note the permuted
# output-register order (0,1,4,6,3,7,2,5) produced by _bsaes_encrypt8.
1901 movdqu 0x00($inp), @XMM[8] # load input
1902 movdqu 0x10($inp), @XMM[9]
1903 movdqu 0x20($inp), @XMM[10]
1904 movdqu 0x30($inp), @XMM[11]
1905 movdqu 0x40($inp), @XMM[12]
1906 movdqu 0x50($inp), @XMM[13]
1907 movdqu 0x60($inp), @XMM[14]
1908 movdqu 0x70($inp), @XMM[15]
1910 pxor @XMM[0], @XMM[8]
1911 movdqa 0x20(%rbp), @XMM[0] # load counter
1912 pxor @XMM[9], @XMM[1]
1913 movdqu @XMM[8], 0x00($out) # write output
1914 pxor @XMM[10], @XMM[4]
1915 movdqu @XMM[1], 0x10($out)
1916 pxor @XMM[11], @XMM[6]
1917 movdqu @XMM[4], 0x20($out)
1918 pxor @XMM[12], @XMM[3]
1919 movdqu @XMM[6], 0x30($out)
1920 pxor @XMM[13], @XMM[7]
1921 movdqu @XMM[3], 0x40($out)
1922 pxor @XMM[14], @XMM[2]
1923 movdqu @XMM[7], 0x50($out)
1924 pxor @XMM[15], @XMM[5]
1925 movdqu @XMM[2], 0x60($out)
1926 lea .LADD1(%rip), %r11
1927 movdqu @XMM[5], 0x70($out)
1928 lea 0x80($out), $out
1929 paddd 0x70(%r11), @XMM[0] # .LADD8
# Partial-batch tail: store however many of the 1..7 remaining blocks apply.
1934 .Lctr_enc_loop_done:
1936 movdqu 0x00($inp), @XMM[8] # load input
1937 pxor @XMM[8], @XMM[0]
1938 movdqu @XMM[0], 0x00($out) # write output
1941 movdqu 0x10($inp), @XMM[9]
1942 pxor @XMM[9], @XMM[1]
1943 movdqu @XMM[1], 0x10($out)
1945 movdqu 0x20($inp), @XMM[10]
1946 pxor @XMM[10], @XMM[4]
1947 movdqu @XMM[4], 0x20($out)
1950 movdqu 0x30($inp), @XMM[11]
1951 pxor @XMM[11], @XMM[6]
1952 movdqu @XMM[6], 0x30($out)
1954 movdqu 0x40($inp), @XMM[12]
1955 pxor @XMM[12], @XMM[3]
1956 movdqu @XMM[3], 0x40($out)
1959 movdqu 0x50($inp), @XMM[13]
1960 pxor @XMM[13], @XMM[7]
1961 movdqu @XMM[7], 0x50($out)
1963 movdqu 0x60($inp), @XMM[14]
1964 pxor @XMM[14], @XMM[2]
1965 movdqu @XMM[2], 0x60($out)
# "Short" path (presumably <8 blocks total, taken before the bit-sliced
# schedule is worth setting up — confirm against full file): one block at a
# time through the scalar AES routine, counter incremented as a 32-bit int.
1970 lea 0x20(%rbp), $arg1
1971 lea 0x30(%rbp), $arg2
1973 call asm_AES_encrypt
1974 movdqu ($inp), @XMM[1]
1976 mov 0x2c(%rbp), %eax # load 32-bit counter
1978 pxor 0x30(%rbp), @XMM[1]
1979 inc %eax # increment
1980 movdqu @XMM[1], ($out)
1983 mov %eax, 0x2c(%rsp) # save 32-bit counter
# Scrub the on-stack key schedule before returning.
1990 .Lctr_enc_bzero: # wipe key schedule [if any]
1991 movdqa %xmm0, 0x00(%rax)
1992 movdqa %xmm0, 0x10(%rax)
1993 lea 0x20(%rax), %rax
1997 lea (%rbp),%rsp # restore %rsp
# Win64 epilogue: restore xmm6-15, then the callee-saved GPRs.
1999 $code.=<<___ if ($win64);
2000 movaps 0x40(%rbp), %xmm6
2001 movaps 0x50(%rbp), %xmm7
2002 movaps 0x60(%rbp), %xmm8
2003 movaps 0x70(%rbp), %xmm9
2004 movaps 0x80(%rbp), %xmm10
2005 movaps 0x90(%rbp), %xmm11
2006 movaps 0xa0(%rbp), %xmm12
2007 movaps 0xb0(%rbp), %xmm13
2008 movaps 0xc0(%rbp), %xmm14
2009 movaps 0xd0(%rbp), %xmm15
2010 lea 0xa0(%rbp), %rsp
2013 mov 0x48(%rsp), %r15
2014 mov 0x50(%rsp), %r14
2015 mov 0x58(%rsp), %r13
2016 mov 0x60(%rsp), %r12
2017 mov 0x68(%rsp), %rbx
2018 mov 0x70(%rsp), %rax
2019 lea 0x78(%rsp), %rsp
2023 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2025 ######################################################################
2026 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2027 # const AES_KEY *key1, const AES_KEY *key2,
2028 # const unsigned char iv[16]);
# Registers used for the GF(2^128) tweak update: mask constant, shifted
# residue, and the sign-broadcast temporary.
2030 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2032 .globl bsaes_xts_encrypt
2033 .type bsaes_xts_encrypt,\@abi-omnipotent
2044 lea -0x48(%rsp), %rsp
# Win64: 5th/6th arguments come from the stack; save xmm6-15.
2046 $code.=<<___ if ($win64);
2047 mov 0xa0(%rsp),$arg5 # pull key2
2048 mov 0xa8(%rsp),$arg6 # pull ivp
2049 lea -0xa0(%rsp), %rsp
2050 movaps %xmm6, 0x40(%rsp)
2051 movaps %xmm7, 0x50(%rsp)
2052 movaps %xmm8, 0x60(%rsp)
2053 movaps %xmm9, 0x70(%rsp)
2054 movaps %xmm10, 0x80(%rsp)
2055 movaps %xmm11, 0x90(%rsp)
2056 movaps %xmm12, 0xa0(%rsp)
2057 movaps %xmm13, 0xb0(%rsp)
2058 movaps %xmm14, 0xc0(%rsp)
2059 movaps %xmm15, 0xd0(%rsp)
2063 mov %rsp, %rbp # backup %rsp
2064 mov $arg1, $inp # backup arguments
# Initial tweak = AES-encrypt(key2, iv), per XTS.
2070 lea 0x20(%rbp), $arg2
2072 call asm_AES_encrypt # generate initial tweak
2074 mov 240($key), %eax # rounds
2075 mov $len, %rbx # backup $len
2077 mov %eax, %edx # rounds
2078 shl \$7, %rax # 128 bytes per inner round key
2079 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert key1 to the bit-sliced key schedule on the stack.
2082 mov %rsp, %rax # pass key schedule
2083 mov $key, %rcx # pass key
2084 mov %edx, %r10d # pass rounds
2085 call _bsaes_key_convert
2086 pxor %xmm6, %xmm7 # fix up last round key
2087 movdqa %xmm7, (%rax) # save last round key
2090 sub \$0x80, %rsp # place for tweak[8]
2091 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# Tweak doubling in GF(2^128): pcmpgtd broadcasts the top bit's sign,
# pshufd \$0x13 moves it to the low lane, pand with .Lxts_magic isolates the
# x^128 reduction polynomial, paddq doubles, pxor folds the carry back in.
2094 movdqa .Lxts_magic(%rip), $twmask
2095 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Unrolled at Perl level: generate tweak[0..6], load and pre-xor inputs.
2104 for ($i=0;$i<7;$i++) {
2106 pshufd \$0x13, $twtmp, $twres
2108 movdqa @XMM[7], @XMM[$i]
2109 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2110 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2111 pand $twmask, $twres # isolate carry and residue
2112 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2113 pxor $twres, @XMM[7]
2115 $code.=<<___ if ($i>=1);
2116 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2118 $code.=<<___ if ($i>=2);
2119 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Finish the 8-block batch: last two input loads and xors done by hand.
2123 movdqu 0x60($inp), @XMM[8+6]
2124 pxor @XMM[8+5], @XMM[5]
2125 movdqu 0x70($inp), @XMM[8+7]
2126 lea 0x80($inp), $inp
2127 movdqa @XMM[7], 0x70(%rsp)
2128 pxor @XMM[8+6], @XMM[6]
2129 lea 0x80(%rsp), %rax # pass key schedule
2130 pxor @XMM[8+7], @XMM[7]
2131 mov %edx, %r10d # pass rounds
2133 call _bsaes_encrypt8
# Post-whitening: xor each ciphertext block with its saved tweak (note the
# permuted register order 0,1,4,6,3,7,2,5 from _bsaes_encrypt8).
2135 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2136 pxor 0x10(%rsp), @XMM[1]
2137 movdqu @XMM[0], 0x00($out) # write output
2138 pxor 0x20(%rsp), @XMM[4]
2139 movdqu @XMM[1], 0x10($out)
2140 pxor 0x30(%rsp), @XMM[6]
2141 movdqu @XMM[4], 0x20($out)
2142 pxor 0x40(%rsp), @XMM[3]
2143 movdqu @XMM[6], 0x30($out)
2144 pxor 0x50(%rsp), @XMM[7]
2145 movdqu @XMM[3], 0x40($out)
2146 pxor 0x60(%rsp), @XMM[2]
2147 movdqu @XMM[7], 0x50($out)
2148 pxor 0x70(%rsp), @XMM[5]
2149 movdqu @XMM[2], 0x60($out)
2150 movdqu @XMM[5], 0x70($out)
2151 lea 0x80($out), $out
# Compute the tweak for the next 8-block iteration from saved tweak[7].
2153 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2155 movdqa .Lxts_magic(%rip), $twmask
2156 pcmpgtd @XMM[7], $twtmp
2157 pshufd \$0x13, $twtmp, $twres
2159 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2160 pand $twmask, $twres # isolate carry and residue
2161 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2162 pxor $twres, @XMM[7]
# Short-path dispatcher: same tweak generation as above, but with a length
# compare per block to jump to the correct 7..1-block handler below.
2171 for ($i=0;$i<7;$i++) {
2173 pshufd \$0x13, $twtmp, $twres
2175 movdqa @XMM[7], @XMM[$i]
2176 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2177 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2178 pand $twmask, $twres # isolate carry and residue
2179 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2180 pxor $twres, @XMM[7]
2182 $code.=<<___ if ($i>=1);
2183 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2184 cmp \$`0x10*$i`,$len
2187 $code.=<<___ if ($i>=2);
2188 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block case.
2192 movdqu 0x60($inp), @XMM[8+6]
2193 pxor @XMM[8+5], @XMM[5]
2194 movdqa @XMM[7], 0x70(%rsp)
2195 lea 0x70($inp), $inp
2196 pxor @XMM[8+6], @XMM[6]
2197 lea 0x80(%rsp), %rax # pass key schedule
2198 mov %edx, %r10d # pass rounds
2200 call _bsaes_encrypt8
2202 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2203 pxor 0x10(%rsp), @XMM[1]
2204 movdqu @XMM[0], 0x00($out) # write output
2205 pxor 0x20(%rsp), @XMM[4]
2206 movdqu @XMM[1], 0x10($out)
2207 pxor 0x30(%rsp), @XMM[6]
2208 movdqu @XMM[4], 0x20($out)
2209 pxor 0x40(%rsp), @XMM[3]
2210 movdqu @XMM[6], 0x30($out)
2211 pxor 0x50(%rsp), @XMM[7]
2212 movdqu @XMM[3], 0x40($out)
2213 pxor 0x60(%rsp), @XMM[2]
2214 movdqu @XMM[7], 0x50($out)
2215 movdqu @XMM[2], 0x60($out)
2216 lea 0x70($out), $out
2218 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block case.
2222 pxor @XMM[8+4], @XMM[4]
2223 lea 0x60($inp), $inp
2224 pxor @XMM[8+5], @XMM[5]
2225 lea 0x80(%rsp), %rax # pass key schedule
2226 mov %edx, %r10d # pass rounds
2228 call _bsaes_encrypt8
2230 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2231 pxor 0x10(%rsp), @XMM[1]
2232 movdqu @XMM[0], 0x00($out) # write output
2233 pxor 0x20(%rsp), @XMM[4]
2234 movdqu @XMM[1], 0x10($out)
2235 pxor 0x30(%rsp), @XMM[6]
2236 movdqu @XMM[4], 0x20($out)
2237 pxor 0x40(%rsp), @XMM[3]
2238 movdqu @XMM[6], 0x30($out)
2239 pxor 0x50(%rsp), @XMM[7]
2240 movdqu @XMM[3], 0x40($out)
2241 movdqu @XMM[7], 0x50($out)
2242 lea 0x60($out), $out
2244 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block case.
2248 pxor @XMM[8+3], @XMM[3]
2249 lea 0x50($inp), $inp
2250 pxor @XMM[8+4], @XMM[4]
2251 lea 0x80(%rsp), %rax # pass key schedule
2252 mov %edx, %r10d # pass rounds
2254 call _bsaes_encrypt8
2256 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2257 pxor 0x10(%rsp), @XMM[1]
2258 movdqu @XMM[0], 0x00($out) # write output
2259 pxor 0x20(%rsp), @XMM[4]
2260 movdqu @XMM[1], 0x10($out)
2261 pxor 0x30(%rsp), @XMM[6]
2262 movdqu @XMM[4], 0x20($out)
2263 pxor 0x40(%rsp), @XMM[3]
2264 movdqu @XMM[6], 0x30($out)
2265 movdqu @XMM[3], 0x40($out)
2266 lea 0x50($out), $out
2268 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block case.
2272 pxor @XMM[8+2], @XMM[2]
2273 lea 0x40($inp), $inp
2274 pxor @XMM[8+3], @XMM[3]
2275 lea 0x80(%rsp), %rax # pass key schedule
2276 mov %edx, %r10d # pass rounds
2278 call _bsaes_encrypt8
2280 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2281 pxor 0x10(%rsp), @XMM[1]
2282 movdqu @XMM[0], 0x00($out) # write output
2283 pxor 0x20(%rsp), @XMM[4]
2284 movdqu @XMM[1], 0x10($out)
2285 pxor 0x30(%rsp), @XMM[6]
2286 movdqu @XMM[4], 0x20($out)
2287 movdqu @XMM[6], 0x30($out)
2288 lea 0x40($out), $out
2290 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block case.
2294 pxor @XMM[8+1], @XMM[1]
2295 lea 0x30($inp), $inp
2296 pxor @XMM[8+2], @XMM[2]
2297 lea 0x80(%rsp), %rax # pass key schedule
2298 mov %edx, %r10d # pass rounds
2300 call _bsaes_encrypt8
2302 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2303 pxor 0x10(%rsp), @XMM[1]
2304 movdqu @XMM[0], 0x00($out) # write output
2305 pxor 0x20(%rsp), @XMM[4]
2306 movdqu @XMM[1], 0x10($out)
2307 movdqu @XMM[4], 0x20($out)
2308 lea 0x30($out), $out
2310 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block case.
2314 pxor @XMM[8+0], @XMM[0]
2315 lea 0x20($inp), $inp
2316 pxor @XMM[8+1], @XMM[1]
2317 lea 0x80(%rsp), %rax # pass key schedule
2318 mov %edx, %r10d # pass rounds
2320 call _bsaes_encrypt8
2322 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2323 pxor 0x10(%rsp), @XMM[1]
2324 movdqu @XMM[0], 0x00($out) # write output
2325 movdqu @XMM[1], 0x10($out)
2326 lea 0x20($out), $out
2328 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block case: scalar AES routine (keeps %xmm intact); tweak pre/post-xor
# done around the call.  The commented-out alternative kept for reference.
2332 pxor @XMM[0], @XMM[8]
2333 lea 0x10($inp), $inp
2334 movdqa @XMM[8], 0x20(%rbp)
2335 lea 0x20(%rbp), $arg1
2336 lea 0x20(%rbp), $arg2
2338 call asm_AES_encrypt # doesn't touch %xmm
2339 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2340 #pxor @XMM[8], @XMM[0]
2341 #lea 0x80(%rsp), %rax # pass key schedule
2342 #mov %edx, %r10d # pass rounds
2343 #call _bsaes_encrypt8
2344 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2345 movdqu @XMM[0], 0x00($out) # write output
2346 lea 0x10($out), $out
2348 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing for a non-multiple-of-16 tail: copy bytes of the last
# full ciphertext block, then re-encrypt the stolen block with the final tweak.
2357 movzb -16(%rdx), %ecx
2365 movdqu -16($out), @XMM[0]
2366 lea 0x20(%rbp), $arg1
2367 pxor @XMM[7], @XMM[0]
2368 lea 0x20(%rbp), $arg2
2369 movdqa @XMM[0], 0x20(%rbp)
2371 call asm_AES_encrypt # doesn't touch %xmm
2372 pxor 0x20(%rbp), @XMM[7]
2373 movdqu @XMM[7], -16($out)
# Scrub the key schedule, restore stack/registers, done.
2378 .Lxts_enc_bzero: # wipe key schedule [if any]
2379 movdqa %xmm0, 0x00(%rax)
2380 movdqa %xmm0, 0x10(%rax)
2381 lea 0x20(%rax), %rax
2385 lea (%rbp),%rsp # restore %rsp
2387 $code.=<<___ if ($win64);
2388 movaps 0x40(%rbp), %xmm6
2389 movaps 0x50(%rbp), %xmm7
2390 movaps 0x60(%rbp), %xmm8
2391 movaps 0x70(%rbp), %xmm9
2392 movaps 0x80(%rbp), %xmm10
2393 movaps 0x90(%rbp), %xmm11
2394 movaps 0xa0(%rbp), %xmm12
2395 movaps 0xb0(%rbp), %xmm13
2396 movaps 0xc0(%rbp), %xmm14
2397 movaps 0xd0(%rbp), %xmm15
2398 lea 0xa0(%rbp), %rsp
2401 mov 0x48(%rsp), %r15
2402 mov 0x50(%rsp), %r14
2403 mov 0x58(%rsp), %r13
2404 mov 0x60(%rsp), %r12
2405 mov 0x68(%rsp), %rbx
2406 mov 0x70(%rsp), %rax
2407 lea 0x78(%rsp), %rsp
2411 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
# bsaes_xts_decrypt: mirror of bsaes_xts_encrypt.  Differences: calls
# _bsaes_decrypt8 (output register order 0,1,6,4,2,7,3,5), the key-convert
# fix-up targets the round-0 key instead of the last one, and the
# ciphertext-stealing tail needs the *previous* tweak (kept in @XMM[6]).
2413 .globl bsaes_xts_decrypt
2414 .type bsaes_xts_decrypt,\@abi-omnipotent
2425 lea -0x48(%rsp), %rsp
2427 $code.=<<___ if ($win64);
2428 mov 0xa0(%rsp),$arg5 # pull key2
2429 mov 0xa8(%rsp),$arg6 # pull ivp
2430 lea -0xa0(%rsp), %rsp
2431 movaps %xmm6, 0x40(%rsp)
2432 movaps %xmm7, 0x50(%rsp)
2433 movaps %xmm8, 0x60(%rsp)
2434 movaps %xmm9, 0x70(%rsp)
2435 movaps %xmm10, 0x80(%rsp)
2436 movaps %xmm11, 0x90(%rsp)
2437 movaps %xmm12, 0xa0(%rsp)
2438 movaps %xmm13, 0xb0(%rsp)
2439 movaps %xmm14, 0xc0(%rsp)
2440 movaps %xmm15, 0xd0(%rsp)
2444 mov %rsp, %rbp # backup %rsp
2445 mov $arg1, $inp # backup arguments
# Tweak is always generated with ENcryption of the IV under key2.
2451 lea 0x20(%rbp), $arg2
2453 call asm_AES_encrypt # generate initial tweak
2455 mov 240($key), %eax # rounds
2456 mov $len, %rbx # backup $len
2458 mov %eax, %edx # rounds
2459 shl \$7, %rax # 128 bytes per inner round key
2460 sub \$`128-32`, %rax # size of bit-sliced key schedule
2463 mov %rsp, %rax # pass key schedule
2464 mov $key, %rcx # pass key
2465 mov %edx, %r10d # pass rounds
2466 call _bsaes_key_convert
# Decrypt-direction fix-up: swap roles of first and last round keys.
2467 pxor (%rsp), %xmm7 # fix up round 0 key
2468 movdqa %xmm6, (%rax) # save last round key
2469 movdqa %xmm7, (%rsp)
2471 xor %eax, %eax # if ($len%16) len-=16;
2478 sub \$0x80, %rsp # place for tweak[8]
2479 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# GF(2^128) tweak doubling, same recipe as in the encrypt path.
2482 movdqa .Lxts_magic(%rip), $twmask
2483 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2492 for ($i=0;$i<7;$i++) {
2494 pshufd \$0x13, $twtmp, $twres
2496 movdqa @XMM[7], @XMM[$i]
2497 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2498 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2499 pand $twmask, $twres # isolate carry and residue
2500 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2501 pxor $twres, @XMM[7]
2503 $code.=<<___ if ($i>=1);
2504 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2506 $code.=<<___ if ($i>=2);
2507 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Full 8-block batch.
2511 movdqu 0x60($inp), @XMM[8+6]
2512 pxor @XMM[8+5], @XMM[5]
2513 movdqu 0x70($inp), @XMM[8+7]
2514 lea 0x80($inp), $inp
2515 movdqa @XMM[7], 0x70(%rsp)
2516 pxor @XMM[8+6], @XMM[6]
2517 lea 0x80(%rsp), %rax # pass key schedule
2518 pxor @XMM[8+7], @XMM[7]
2519 mov %edx, %r10d # pass rounds
2521 call _bsaes_decrypt8
# Post-whitening in decrypt register order (0,1,6,4,2,7,3,5).
2523 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2524 pxor 0x10(%rsp), @XMM[1]
2525 movdqu @XMM[0], 0x00($out) # write output
2526 pxor 0x20(%rsp), @XMM[6]
2527 movdqu @XMM[1], 0x10($out)
2528 pxor 0x30(%rsp), @XMM[4]
2529 movdqu @XMM[6], 0x20($out)
2530 pxor 0x40(%rsp), @XMM[2]
2531 movdqu @XMM[4], 0x30($out)
2532 pxor 0x50(%rsp), @XMM[7]
2533 movdqu @XMM[2], 0x40($out)
2534 pxor 0x60(%rsp), @XMM[3]
2535 movdqu @XMM[7], 0x50($out)
2536 pxor 0x70(%rsp), @XMM[5]
2537 movdqu @XMM[3], 0x60($out)
2538 movdqu @XMM[5], 0x70($out)
2539 lea 0x80($out), $out
2541 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2543 movdqa .Lxts_magic(%rip), $twmask
2544 pcmpgtd @XMM[7], $twtmp
2545 pshufd \$0x13, $twtmp, $twres
2547 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2548 pand $twmask, $twres # isolate carry and residue
2549 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2550 pxor $twres, @XMM[7]
# Short-path dispatcher for 7..1 remaining blocks.
2559 for ($i=0;$i<7;$i++) {
2561 pshufd \$0x13, $twtmp, $twres
2563 movdqa @XMM[7], @XMM[$i]
2564 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2565 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2566 pand $twmask, $twres # isolate carry and residue
2567 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2568 pxor $twres, @XMM[7]
2570 $code.=<<___ if ($i>=1);
2571 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2572 cmp \$`0x10*$i`,$len
2575 $code.=<<___ if ($i>=2);
2576 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block case.
2580 movdqu 0x60($inp), @XMM[8+6]
2581 pxor @XMM[8+5], @XMM[5]
2582 movdqa @XMM[7], 0x70(%rsp)
2583 lea 0x70($inp), $inp
2584 pxor @XMM[8+6], @XMM[6]
2585 lea 0x80(%rsp), %rax # pass key schedule
2586 mov %edx, %r10d # pass rounds
2588 call _bsaes_decrypt8
2590 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2591 pxor 0x10(%rsp), @XMM[1]
2592 movdqu @XMM[0], 0x00($out) # write output
2593 pxor 0x20(%rsp), @XMM[6]
2594 movdqu @XMM[1], 0x10($out)
2595 pxor 0x30(%rsp), @XMM[4]
2596 movdqu @XMM[6], 0x20($out)
2597 pxor 0x40(%rsp), @XMM[2]
2598 movdqu @XMM[4], 0x30($out)
2599 pxor 0x50(%rsp), @XMM[7]
2600 movdqu @XMM[2], 0x40($out)
2601 pxor 0x60(%rsp), @XMM[3]
2602 movdqu @XMM[7], 0x50($out)
2603 movdqu @XMM[3], 0x60($out)
2604 lea 0x70($out), $out
2606 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block case.
2610 pxor @XMM[8+4], @XMM[4]
2611 lea 0x60($inp), $inp
2612 pxor @XMM[8+5], @XMM[5]
2613 lea 0x80(%rsp), %rax # pass key schedule
2614 mov %edx, %r10d # pass rounds
2616 call _bsaes_decrypt8
2618 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2619 pxor 0x10(%rsp), @XMM[1]
2620 movdqu @XMM[0], 0x00($out) # write output
2621 pxor 0x20(%rsp), @XMM[6]
2622 movdqu @XMM[1], 0x10($out)
2623 pxor 0x30(%rsp), @XMM[4]
2624 movdqu @XMM[6], 0x20($out)
2625 pxor 0x40(%rsp), @XMM[2]
2626 movdqu @XMM[4], 0x30($out)
2627 pxor 0x50(%rsp), @XMM[7]
2628 movdqu @XMM[2], 0x40($out)
2629 movdqu @XMM[7], 0x50($out)
2630 lea 0x60($out), $out
2632 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block case.
2636 pxor @XMM[8+3], @XMM[3]
2637 lea 0x50($inp), $inp
2638 pxor @XMM[8+4], @XMM[4]
2639 lea 0x80(%rsp), %rax # pass key schedule
2640 mov %edx, %r10d # pass rounds
2642 call _bsaes_decrypt8
2644 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2645 pxor 0x10(%rsp), @XMM[1]
2646 movdqu @XMM[0], 0x00($out) # write output
2647 pxor 0x20(%rsp), @XMM[6]
2648 movdqu @XMM[1], 0x10($out)
2649 pxor 0x30(%rsp), @XMM[4]
2650 movdqu @XMM[6], 0x20($out)
2651 pxor 0x40(%rsp), @XMM[2]
2652 movdqu @XMM[4], 0x30($out)
2653 movdqu @XMM[2], 0x40($out)
2654 lea 0x50($out), $out
2656 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block case.
2660 pxor @XMM[8+2], @XMM[2]
2661 lea 0x40($inp), $inp
2662 pxor @XMM[8+3], @XMM[3]
2663 lea 0x80(%rsp), %rax # pass key schedule
2664 mov %edx, %r10d # pass rounds
2666 call _bsaes_decrypt8
2668 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2669 pxor 0x10(%rsp), @XMM[1]
2670 movdqu @XMM[0], 0x00($out) # write output
2671 pxor 0x20(%rsp), @XMM[6]
2672 movdqu @XMM[1], 0x10($out)
2673 pxor 0x30(%rsp), @XMM[4]
2674 movdqu @XMM[6], 0x20($out)
2675 movdqu @XMM[4], 0x30($out)
2676 lea 0x40($out), $out
2678 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block case.
2682 pxor @XMM[8+1], @XMM[1]
2683 lea 0x30($inp), $inp
2684 pxor @XMM[8+2], @XMM[2]
2685 lea 0x80(%rsp), %rax # pass key schedule
2686 mov %edx, %r10d # pass rounds
2688 call _bsaes_decrypt8
2690 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2691 pxor 0x10(%rsp), @XMM[1]
2692 movdqu @XMM[0], 0x00($out) # write output
2693 pxor 0x20(%rsp), @XMM[6]
2694 movdqu @XMM[1], 0x10($out)
2695 movdqu @XMM[6], 0x20($out)
2696 lea 0x30($out), $out
2698 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block case.
2702 pxor @XMM[8+0], @XMM[0]
2703 lea 0x20($inp), $inp
2704 pxor @XMM[8+1], @XMM[1]
2705 lea 0x80(%rsp), %rax # pass key schedule
2706 mov %edx, %r10d # pass rounds
2708 call _bsaes_decrypt8
2710 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2711 pxor 0x10(%rsp), @XMM[1]
2712 movdqu @XMM[0], 0x00($out) # write output
2713 movdqu @XMM[1], 0x10($out)
2714 lea 0x20($out), $out
2716 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block case via the scalar routine.
2720 pxor @XMM[0], @XMM[8]
2721 lea 0x10($inp), $inp
2722 movdqa @XMM[8], 0x20(%rbp)
2723 lea 0x20(%rbp), $arg1
2724 lea 0x20(%rbp), $arg2
2726 call asm_AES_decrypt # doesn't touch %xmm
2727 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2728 #pxor @XMM[8], @XMM[0]
2729 #lea 0x80(%rsp), %rax # pass key schedule
2730 #mov %edx, %r10d # pass rounds
2731 #call _bsaes_decrypt8
2732 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2733 movdqu @XMM[0], 0x00($out) # write output
2734 lea 0x10($out), $out
2736 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext-stealing tail: decrypt the penultimate block with the NEXT
# tweak (doubled once more here), keeping the current tweak in @XMM[6] for
# the final stolen-block decryption below.
2743 movdqa .Lxts_magic(%rip), $twmask
2744 pcmpgtd @XMM[7], $twtmp
2745 pshufd \$0x13, $twtmp, $twres
2746 movdqa @XMM[7], @XMM[6]
2747 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2748 pand $twmask, $twres # isolate carry and residue
2749 movdqu ($inp), @XMM[0]
2750 pxor $twres, @XMM[7]
2752 lea 0x20(%rbp), $arg1
2753 pxor @XMM[7], @XMM[0]
2754 lea 0x20(%rbp), $arg2
2755 movdqa @XMM[0], 0x20(%rbp)
2757 call asm_AES_decrypt # doesn't touch %xmm
2758 pxor 0x20(%rbp), @XMM[7]
2760 movdqu @XMM[7], ($out)
# Swap tail bytes, then decrypt the reassembled block with the saved tweak.
2763 movzb 16($inp), %eax
2772 movdqu ($out), @XMM[0]
2773 lea 0x20(%rbp), $arg1
2774 pxor @XMM[6], @XMM[0]
2775 lea 0x20(%rbp), $arg2
2776 movdqa @XMM[0], 0x20(%rbp)
2778 call asm_AES_decrypt # doesn't touch %xmm
2779 pxor 0x20(%rbp), @XMM[6]
2780 movdqu @XMM[6], ($out)
# Scrub key schedule and run the standard epilogue.
2785 .Lxts_dec_bzero: # wipe key schedule [if any]
2786 movdqa %xmm0, 0x00(%rax)
2787 movdqa %xmm0, 0x10(%rax)
2788 lea 0x20(%rax), %rax
2792 lea (%rbp),%rsp # restore %rsp
2794 $code.=<<___ if ($win64);
2795 movaps 0x40(%rbp), %xmm6
2796 movaps 0x50(%rbp), %xmm7
2797 movaps 0x60(%rbp), %xmm8
2798 movaps 0x70(%rbp), %xmm9
2799 movaps 0x80(%rbp), %xmm10
2800 movaps 0x90(%rbp), %xmm11
2801 movaps 0xa0(%rbp), %xmm12
2802 movaps 0xb0(%rbp), %xmm13
2803 movaps 0xc0(%rbp), %xmm14
2804 movaps 0xd0(%rbp), %xmm15
2805 lea 0xa0(%rbp), %rsp
2808 mov 0x48(%rsp), %r15
2809 mov 0x50(%rsp), %r14
2810 mov 0x58(%rsp), %r13
2811 mov 0x60(%rsp), %r12
2812 mov 0x68(%rsp), %rbx
2813 mov 0x70(%rsp), %rax
2814 lea 0x78(%rsp), %rsp
2818 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Read-only constant tables shared by all entry points: pshufb permutation
# masks for (inv)ShiftRows variants, bit-slice masks, CTR increment vectors,
# and the .LM0* input permutations.  Labels between the visible .quads are
# omitted from this excerpt.
2822 .type _bsaes_const,\@object
2825 .LM0ISR: # InvShiftRows constants
2826 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2828 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2830 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2831 .LBS0: # bit-slice constants
2832 .quad 0x5555555555555555, 0x5555555555555555
2834 .quad 0x3333333333333333, 0x3333333333333333
2836 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2837 .LSR: # shiftrows constants
2838 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2840 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2842 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2843 .LSWPUP: # byte-swap upper dword
2844 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2846 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2847 .LADD1: # counter increment constants
2848 .quad 0x0000000000000000, 0x0000000100000000
2850 .quad 0x0000000000000000, 0x0000000200000000
2852 .quad 0x0000000000000000, 0x0000000300000000
2854 .quad 0x0000000000000000, 0x0000000400000000
2856 .quad 0x0000000000000000, 0x0000000500000000
2858 .quad 0x0000000000000000, 0x0000000600000000
2860 .quad 0x0000000000000000, 0x0000000700000000
2862 .quad 0x0000000000000000, 0x0000000800000000
# Per-bit masks (presumably .LBS-style select masks used by key conversion —
# labels not visible in this excerpt; confirm against full file).
2866 .quad 0x0101010101010101, 0x0101010101010101
2867 .quad 0x0202020202020202, 0x0202020202020202
2868 .quad 0x0404040404040404, 0x0404040404040404
2869 .quad 0x0808080808080808, 0x0808080808080808
2871 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 = AES affine constant folded into the bit-sliced key schedule.
2873 .quad 0x6363636363636363, 0x6363636363636363
2874 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2876 .size _bsaes_const,.-_bsaes_const
2879 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2880 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler shared by all entry points: if the
# faulting RIP is inside a function body (between the prologue and epilogue
# labels recorded in HandlerData), it replays the epilogue — restoring
# xmm6-15 and the callee-saved GPRs from the frame — then chains to
# RtlVirtualUnwind so the unwinder can continue past this frame.
2888 .extern __imp_RtlVirtualUnwind
2889 .type se_handler,\@abi-omnipotent
2903 mov 120($context),%rax # pull context->Rax
2904 mov 248($context),%rbx # pull context->Rip
2906 mov 8($disp),%rsi # disp->ImageBase
2907 mov 56($disp),%r11 # disp->HandlerData
# HandlerData[0]/[1] are image-relative prologue/epilogue labels (see the
# .rva pairs emitted below); compare RIP against each to decide whether any
# register state needs restoring.
2909 mov 0(%r11),%r10d # HandlerData[0]
2910 lea (%rsi,%r10),%r10 # prologue label
2911 cmp %r10,%rbx # context->Rip<prologue label
2914 mov 152($context),%rax # pull context->Rsp
2916 mov 4(%r11),%r10d # HandlerData[1]
2917 lea (%rsi,%r10),%r10 # epilogue label
2918 cmp %r10,%rbx # context->Rip>=epilogue label
2921 mov 160($context),%rax # pull context->Rbp
# Copy the 10 saved xmm registers from the frame into context.Xmm6..Xmm15.
2923 lea 0x40(%rax),%rsi # %xmm save area
2924 lea 512($context),%rdi # &context.Xmm6
2925 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2926 .long 0xa548f3fc # cld; rep movsq
2927 lea 0xa0(%rax),%rax # adjust stack pointer
# Replay the GPR part of the epilogue into the CONTEXT record.
2935 lea 0x78(%rax),%rax # adjust stack pointer
2936 mov %rbx,144($context) # restore context->Rbx
2937 mov %rbp,160($context) # restore context->Rbp
2938 mov %r12,216($context) # restore context->R12
2939 mov %r13,224($context) # restore context->R13
2940 mov %r14,232($context) # restore context->R14
2941 mov %r15,240($context) # restore context->R15
2944 mov %rax,152($context) # restore context->Rsp
2946 mov 40($disp),%rdi # disp->ContextRecord
2947 mov $context,%rsi # context
2948 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2949 .long 0xa548f3fc # cld; rep movsq
# Delegate the remainder of the unwind to the OS.
2952 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2953 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2954 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2955 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2956 mov 40(%rsi),%r10 # disp->ContextRecord
2957 lea 56(%rsi),%r11 # &disp->HandlerData
2958 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2959 mov %r10,32(%rsp) # arg5
2960 mov %r11,40(%rsp) # arg6
2961 mov %r12,48(%rsp) # arg7
2962 mov %rcx,56(%rsp) # arg8, (NULL)
2963 call *__imp_RtlVirtualUnwind(%rip)
2965 mov \$1,%eax # ExceptionContinueSearch
2977 .size se_handler,.-se_handler
# SEH tables: one .pdata range per entry point (ECB pair is conditional on
# $ecb), followed by per-function .xdata HandlerData label pairs consumed by
# se_handler above.
2982 $code.=<<___ if ($ecb);
2983 .rva .Lecb_enc_prologue
2984 .rva .Lecb_enc_epilogue
2987 .rva .Lecb_dec_prologue
2988 .rva .Lecb_dec_epilogue
2992 .rva .Lcbc_dec_prologue
2993 .rva .Lcbc_dec_epilogue
2996 .rva .Lctr_enc_prologue
2997 .rva .Lctr_enc_epilogue
3000 .rva .Lxts_enc_prologue
3001 .rva .Lxts_enc_epilogue
3004 .rva .Lxts_dec_prologue
3005 .rva .Lxts_dec_epilogue
3011 $code.=<<___ if ($ecb);
3015 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3019 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3025 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3029 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3033 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3037 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# perlasm post-processing: evaluate `...` expressions embedded in the
# generated assembly text before it is emitted.
3041 $code =~ s/\`([^\`]*)\`/eval($1)/gem;