3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 #   achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.88 +11%
45 # (*) Comparison is not completely fair, because "this" is ECB,
46 # i.e. no extra processing such as counter values calculation
47 # and xor-ing input as in Emilia's CTR implementation is
48 # performed. However, the CTR calculations stand for not more
49 # than 1% of total time, so comparison is *rather* fair.
51 # (**) Results were collected on Westmere, which is considered to
52 # be equivalent to Nehalem for this code.
54 # As for the key schedule conversion subroutine: the interface to OpenSSL
55 # relies on per-invocation on-the-fly conversion. This naturally
56 # has impact on performance, especially for short inputs. Conversion
57 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
60 # conversion conversion/8x block
65 # The ratio values mean that 128-byte blocks will be processed
66 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
67 # etc. Then keep in mind that input sizes not divisible by 128 are
68 # *effectively* slower, especially shortest ones, e.g. consecutive
69 # 144-byte blocks are processed 44% slower than one would expect,
70 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
71 # it's still faster than ["hyper-threading-safe" code path in]
72 # aes-x86_64.pl on all lengths above 64 bytes...
76 # Add decryption procedure. Performance in CPU cycles spent to decrypt
77 # one byte out of 4096-byte buffer with 128-bit key is:
86 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
87 # suboptimal, but XTS is meant to be used with larger blocks...
93 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
95 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
97 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100 die "can't locate x86_64-xlate.pl";
102 open OUT,"| \"$^X\" $xlate $flavour $output";
105 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
106 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
107 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
110 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
113 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
114 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
119 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
120 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
124 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
125 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
147 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
148 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
168 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
169 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
173 &InvInBasisChange (@b);
174 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
175 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
178 sub InvInBasisChange { # OutBasisChange in reverse
179 my @b=@_[5,1,2,6,3,7,0,4];
197 sub InvOutBasisChange { # InBasisChange in reverse
198 my @b=@_[2,5,7,3,6,1,0,4];
219 #;*************************************************************
220 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
221 #;*************************************************************
222 my ($x0,$x1,$y0,$y1,$t0)=@_;
235 sub Mul_GF4_N { # not used, see next subroutine
236 # multiply and scale by N
237 my ($x0,$x1,$y0,$y1,$t0)=@_;
251 # interleaved Mul_GF4_N and Mul_GF4
252 my ($x0,$x1,$y0,$y1,$t0,
253 $x2,$x3,$y2,$y3,$t1)=@_;
281 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
289 @x[2], @x[3], @y[2], @y[3], @t[2]);
301 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
302 @x[6], @x[7], @y[2], @y[3], @t[2]);
307 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
316 #;********************************************************************
317 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
318 #;********************************************************************
322 # direct optimizations from hardware
377 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
379 # new smaller inversion
413 # output in s3, s2, s1, t1
415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
417 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
418 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
420 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
423 # AES linear components
429 pxor 0x00($key),@x[0]
430 pxor 0x10($key),@x[1]
431 pxor 0x20($key),@x[2]
432 pxor 0x30($key),@x[3]
435 pxor 0x40($key),@x[4]
436 pxor 0x50($key),@x[5]
439 pxor 0x60($key),@x[6]
440 pxor 0x70($key),@x[7]
450 # modified to emit output in order suitable for feeding back to aesenc[last]
453 my $inv=@_[16]; # optional
455 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
456 pshufd \$0x93, @x[1], @t[1]
457 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
458 pshufd \$0x93, @x[2], @t[2]
460 pshufd \$0x93, @x[3], @t[3]
462 pshufd \$0x93, @x[4], @t[4]
464 pshufd \$0x93, @x[5], @t[5]
466 pshufd \$0x93, @x[6], @t[6]
468 pshufd \$0x93, @x[7], @t[7]
475 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
477 pshufd \$0x4E, @x[1], @x[1]
483 pshufd \$0x4E, @x[4], @t[0]
485 pshufd \$0x4E, @x[5], @t[1]
487 pshufd \$0x4E, @x[3], @x[4]
489 pshufd \$0x4E, @x[7], @x[5]
491 pshufd \$0x4E, @x[6], @x[3]
493 pshufd \$0x4E, @x[2], @x[6]
496 $code.=<<___ if (!$inv);
504 $code.=<<___ if ($inv);
517 sub InvMixColumns_orig {
522 # multiplication by 0x0e
523 pshufd \$0x93, @x[7], @t[7]
525 pxor @x[5], @x[7] # 7 5
526 pxor @x[5], @x[2] # 2 5
527 pshufd \$0x93, @x[0], @t[0]
529 pxor @x[0], @x[5] # 5 0 [1]
530 pxor @x[1], @x[0] # 0 1
531 pshufd \$0x93, @x[1], @t[1]
532 pxor @x[2], @x[1] # 1 25
533 pxor @x[6], @x[0] # 01 6 [2]
534 pxor @x[3], @x[1] # 125 3 [4]
535 pshufd \$0x93, @x[3], @t[3]
536 pxor @x[0], @x[2] # 25 016 [3]
537 pxor @x[7], @x[3] # 3 75
538 pxor @x[6], @x[7] # 75 6 [0]
539 pshufd \$0x93, @x[6], @t[6]
541 pxor @x[4], @x[6] # 6 4
542 pxor @x[3], @x[4] # 4 375 [6]
543 pxor @x[7], @x[3] # 375 756=36
544 pxor @t[5], @x[6] # 64 5 [7]
545 pxor @t[2], @x[3] # 36 2
546 pxor @t[4], @x[3] # 362 4 [5]
547 pshufd \$0x93, @t[5], @t[5]
549 my @y = @x[7,5,0,2,1,3,4,6];
551 # multiplication by 0x0b
555 pshufd \$0x93, @t[2], @t[2]
559 pshufd \$0x93, @t[4], @t[4]
560 pxor @t[6], @t[7] # clobber t[7]
564 pshufd \$0x93, @t[0], @t[0]
568 pshufd \$0x93, @t[1], @t[1]
572 pshufd \$0x93, @t[2], @t[2]
576 pshufd \$0x93, @t[3], @t[3]
582 pxor @t[5], @t[7] # clobber t[7] even more
585 pshufd \$0x93, @t[4], @t[4]
590 pshufd \$0x93, @t[5], @t[5]
591 pxor @t[6], @t[7] # restore t[7]
593 # multiplication by 0x0d
596 pshufd \$0x93, @t[6], @t[6]
600 pshufd \$0x93, @t[7], @t[7]
609 pshufd \$0x93, @t[0], @t[0]
613 pshufd \$0x93, @t[1], @t[1]
618 pshufd \$0x93, @t[2], @t[2]
620 pxor @t[3], @t[6] # clobber t[6]
627 pshufd \$0x93, @t[4], @t[4]
630 pxor @t[3], @t[6] # restore t[6]
632 pshufd \$0x93, @t[5], @t[5]
633 pshufd \$0x93, @t[6], @t[6]
634 pshufd \$0x93, @t[7], @t[7]
635 pshufd \$0x93, @t[3], @t[3]
637 # multiplication by 0x09
639 pxor @y[1], @t[1] # t[1]=y[1]
640 pxor @t[5], @t[0] # clobber t[0]
643 pxor @y[0], @t[0] # t[0]=y[0]
645 pxor @t[7], @t[6] # clobber t[6]
648 pxor @y[4], @t[4] # t[4]=y[4]
650 pxor @y[3], @t[3] # t[3]=y[3]
652 pxor @y[2], @t[2] # t[2]=y[2]
654 pxor @y[5], @t[5] # t[5]=y[5]
657 pxor @y[6], @t[6] # t[6]=y[6]
658 pxor @y[7], @t[7] # t[7]=y[7]
675 # Thanks to Jussi Kivilinna for providing a pointer to
677 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
678 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
679 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
680 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
683 # multiplication by 0x05-0x00-0x04-0x00
684 pshufd \$0x4E, @x[0], @t[0]
685 pshufd \$0x4E, @x[6], @t[6]
687 pshufd \$0x4E, @x[7], @t[7]
689 pshufd \$0x4E, @x[1], @t[1]
691 pshufd \$0x4E, @x[2], @t[2]
693 pshufd \$0x4E, @x[3], @t[3]
697 pshufd \$0x4E, @x[4], @t[4]
701 pshufd \$0x4E, @x[5], @t[5]
716 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
719 sub aesenc { # not used
723 movdqa 0x30($const),@t[0] # .LSR
725 &ShiftRows (@b,@t[0]);
727 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
730 sub aesenclast { # not used
734 movdqa 0x40($const),@t[0] # .LSRM0
736 &ShiftRows (@b,@t[0]);
739 pxor 0x00($key),@b[0]
740 pxor 0x10($key),@b[1]
741 pxor 0x20($key),@b[4]
742 pxor 0x30($key),@b[6]
743 pxor 0x40($key),@b[3]
744 pxor 0x50($key),@b[7]
745 pxor 0x60($key),@b[2]
746 pxor 0x70($key),@b[5]
751 my ($a,$b,$n,$mask,$t)=@_;
763 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
783 my @x=reverse(@_[0..7]);
784 my ($t0,$t1,$t2,$t3)=@_[8..11];
786 movdqa 0x00($const),$t0 # .LBS0
787 movdqa 0x10($const),$t1 # .LBS1
789 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
790 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
792 movdqa 0x20($const),$t0 # .LBS2
794 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
795 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
797 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
798 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
804 .extern asm_AES_encrypt
805 .extern asm_AES_decrypt
807 .type _bsaes_encrypt8,\@abi-omnipotent
810 lea .LBS0(%rip), $const # constants table
812 movdqa ($key), @XMM[9] # round 0 key
814 movdqa 0x50($const), @XMM[8] # .LM0SR
815 pxor @XMM[9], @XMM[0] # xor with round0 key
816 pxor @XMM[9], @XMM[1]
817 pxor @XMM[9], @XMM[2]
818 pxor @XMM[9], @XMM[3]
819 pshufb @XMM[8], @XMM[0]
820 pshufb @XMM[8], @XMM[1]
821 pxor @XMM[9], @XMM[4]
822 pxor @XMM[9], @XMM[5]
823 pshufb @XMM[8], @XMM[2]
824 pshufb @XMM[8], @XMM[3]
825 pxor @XMM[9], @XMM[6]
826 pxor @XMM[9], @XMM[7]
827 pshufb @XMM[8], @XMM[4]
828 pshufb @XMM[8], @XMM[5]
829 pshufb @XMM[8], @XMM[6]
830 pshufb @XMM[8], @XMM[7]
831 _bsaes_encrypt8_bitslice:
833 &bitslice (@XMM[0..7, 8..11]);
840 &ShiftRows (@XMM[0..7, 8]);
841 $code.=".Lenc_sbox:\n";
842 &Sbox (@XMM[0..7, 8..15]);
847 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
849 movdqa 0x30($const), @XMM[8] # .LSR
851 movdqa 0x40($const), @XMM[8] # .LSRM0
856 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
857 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
859 movdqa ($key), @XMM[8] # last round key
860 pxor @XMM[8], @XMM[4]
861 pxor @XMM[8], @XMM[6]
862 pxor @XMM[8], @XMM[3]
863 pxor @XMM[8], @XMM[7]
864 pxor @XMM[8], @XMM[2]
865 pxor @XMM[8], @XMM[5]
866 pxor @XMM[8], @XMM[0]
867 pxor @XMM[8], @XMM[1]
869 .size _bsaes_encrypt8,.-_bsaes_encrypt8
871 .type _bsaes_decrypt8,\@abi-omnipotent
874 lea .LBS0(%rip), $const # constants table
876 movdqa ($key), @XMM[9] # round 0 key
878 movdqa -0x30($const), @XMM[8] # .LM0ISR
879 pxor @XMM[9], @XMM[0] # xor with round0 key
880 pxor @XMM[9], @XMM[1]
881 pxor @XMM[9], @XMM[2]
882 pxor @XMM[9], @XMM[3]
883 pshufb @XMM[8], @XMM[0]
884 pshufb @XMM[8], @XMM[1]
885 pxor @XMM[9], @XMM[4]
886 pxor @XMM[9], @XMM[5]
887 pshufb @XMM[8], @XMM[2]
888 pshufb @XMM[8], @XMM[3]
889 pxor @XMM[9], @XMM[6]
890 pxor @XMM[9], @XMM[7]
891 pshufb @XMM[8], @XMM[4]
892 pshufb @XMM[8], @XMM[5]
893 pshufb @XMM[8], @XMM[6]
894 pshufb @XMM[8], @XMM[7]
896 &bitslice (@XMM[0..7, 8..11]);
903 &ShiftRows (@XMM[0..7, 8]);
904 $code.=".Ldec_sbox:\n";
905 &InvSbox (@XMM[0..7, 8..15]);
910 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
912 movdqa -0x10($const), @XMM[8] # .LISR
914 movdqa -0x20($const), @XMM[8] # .LISRM0
919 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
921 movdqa ($key), @XMM[8] # last round key
922 pxor @XMM[8], @XMM[6]
923 pxor @XMM[8], @XMM[4]
924 pxor @XMM[8], @XMM[2]
925 pxor @XMM[8], @XMM[7]
926 pxor @XMM[8], @XMM[3]
927 pxor @XMM[8], @XMM[5]
928 pxor @XMM[8], @XMM[0]
929 pxor @XMM[8], @XMM[1]
931 .size _bsaes_decrypt8,.-_bsaes_decrypt8
935 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
938 my @x=reverse(@_[0..7]);
939 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
941 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
943 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
947 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
949 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
951 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
957 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
958 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
962 .type _bsaes_key_convert,\@abi-omnipotent
965 lea .Lmasks(%rip), $const
966 movdqu ($inp), %xmm7 # load round 0 key
968 movdqa 0x00($const), %xmm0 # 0x01...
969 movdqa 0x10($const), %xmm1 # 0x02...
970 movdqa 0x20($const), %xmm2 # 0x04...
971 movdqa 0x30($const), %xmm3 # 0x08...
972 movdqa 0x40($const), %xmm4 # .LM0
973 pcmpeqd %xmm5, %xmm5 # .LNOT
975 movdqu ($inp), %xmm6 # load round 1 key
976 movdqa %xmm7, ($out) # save round 0 key
982 pshufb %xmm4, %xmm6 # .LM0
991 psllq \$4, %xmm0 # 0x10...
994 psllq \$4, %xmm1 # 0x20...
999 pcmpeqb %xmm2, %xmm10
1000 psllq \$4, %xmm2 # 0x40...
1001 movdqa %xmm1, %xmm13
1002 pcmpeqb %xmm3, %xmm11
1003 psllq \$4, %xmm3 # 0x80...
1005 movdqa %xmm2, %xmm14
1006 movdqa %xmm3, %xmm15
1007 pxor %xmm5, %xmm8 # "pnot"
1012 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1013 pcmpeqb %xmm0, %xmm12
1014 psrlq \$4, %xmm0 # 0x01...
1015 movdqa %xmm9, 0x10($out)
1016 pcmpeqb %xmm1, %xmm13
1017 psrlq \$4, %xmm1 # 0x02...
1018 lea 0x10($inp), $inp
1022 movdqa %xmm10, 0x20($out)
1023 pcmpeqb %xmm2, %xmm14
1024 psrlq \$4, %xmm2 # 0x04...
1025 movdqa %xmm11, 0x30($out)
1026 pcmpeqb %xmm3, %xmm15
1027 psrlq \$4, %xmm3 # 0x08...
1028 movdqu ($inp), %xmm6 # load next round key
1030 pxor %xmm5, %xmm13 # "pnot"
1032 movdqa %xmm12, 0x40($out)
1033 movdqa %xmm13, 0x50($out)
1034 movdqa %xmm14, 0x60($out)
1035 movdqa %xmm15, 0x70($out)
1040 movdqa 0x50($const), %xmm7 # .L63
1041 #movdqa %xmm6, ($out) # don't save last round key
1043 .size _bsaes_key_convert,.-_bsaes_key_convert
1047 if (0 && !$win64) { # following four functions are unsupported interface
1048 # used for benchmarking...
1050 .globl bsaes_enc_key_convert
1051 .type bsaes_enc_key_convert,\@function,2
1053 bsaes_enc_key_convert:
1054 mov 240($inp),%r10d # pass rounds
1055 mov $inp,%rcx # pass key
1056 mov $out,%rax # pass key schedule
1057 call _bsaes_key_convert
1058 pxor %xmm6,%xmm7 # fix up last round key
1059 movdqa %xmm7,(%rax) # save last round key
1061 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1063 .globl bsaes_encrypt_128
1064 .type bsaes_encrypt_128,\@function,4
1068 movdqu 0x00($inp), @XMM[0] # load input
1069 movdqu 0x10($inp), @XMM[1]
1070 movdqu 0x20($inp), @XMM[2]
1071 movdqu 0x30($inp), @XMM[3]
1072 movdqu 0x40($inp), @XMM[4]
1073 movdqu 0x50($inp), @XMM[5]
1074 movdqu 0x60($inp), @XMM[6]
1075 movdqu 0x70($inp), @XMM[7]
1076 mov $key, %rax # pass the $key
1077 lea 0x80($inp), $inp
1080 call _bsaes_encrypt8
1082 movdqu @XMM[0], 0x00($out) # write output
1083 movdqu @XMM[1], 0x10($out)
1084 movdqu @XMM[4], 0x20($out)
1085 movdqu @XMM[6], 0x30($out)
1086 movdqu @XMM[3], 0x40($out)
1087 movdqu @XMM[7], 0x50($out)
1088 movdqu @XMM[2], 0x60($out)
1089 movdqu @XMM[5], 0x70($out)
1090 lea 0x80($out), $out
1094 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1096 .globl bsaes_dec_key_convert
1097 .type bsaes_dec_key_convert,\@function,2
1099 bsaes_dec_key_convert:
1100 mov 240($inp),%r10d # pass rounds
1101 mov $inp,%rcx # pass key
1102 mov $out,%rax # pass key schedule
1103 call _bsaes_key_convert
1104 pxor ($out),%xmm7 # fix up round 0 key
1105 movdqa %xmm6,(%rax) # save last round key
1108 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1110 .globl bsaes_decrypt_128
1111 .type bsaes_decrypt_128,\@function,4
1115 movdqu 0x00($inp), @XMM[0] # load input
1116 movdqu 0x10($inp), @XMM[1]
1117 movdqu 0x20($inp), @XMM[2]
1118 movdqu 0x30($inp), @XMM[3]
1119 movdqu 0x40($inp), @XMM[4]
1120 movdqu 0x50($inp), @XMM[5]
1121 movdqu 0x60($inp), @XMM[6]
1122 movdqu 0x70($inp), @XMM[7]
1123 mov $key, %rax # pass the $key
1124 lea 0x80($inp), $inp
1127 call _bsaes_decrypt8
1129 movdqu @XMM[0], 0x00($out) # write output
1130 movdqu @XMM[1], 0x10($out)
1131 movdqu @XMM[6], 0x20($out)
1132 movdqu @XMM[4], 0x30($out)
1133 movdqu @XMM[2], 0x40($out)
1134 movdqu @XMM[7], 0x50($out)
1135 movdqu @XMM[3], 0x60($out)
1136 movdqu @XMM[5], 0x70($out)
1137 lea 0x80($out), $out
1141 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1145 ######################################################################
1149 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1150 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1151 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1155 .globl bsaes_ecb_encrypt_blocks
1156 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1158 bsaes_ecb_encrypt_blocks:
1167 lea -0x48(%rsp),%rsp
1169 $code.=<<___ if ($win64);
1170 lea -0xa0(%rsp), %rsp
1171 movaps %xmm6, 0x40(%rsp)
1172 movaps %xmm7, 0x50(%rsp)
1173 movaps %xmm8, 0x60(%rsp)
1174 movaps %xmm9, 0x70(%rsp)
1175 movaps %xmm10, 0x80(%rsp)
1176 movaps %xmm11, 0x90(%rsp)
1177 movaps %xmm12, 0xa0(%rsp)
1178 movaps %xmm13, 0xb0(%rsp)
1179 movaps %xmm14, 0xc0(%rsp)
1180 movaps %xmm15, 0xd0(%rsp)
1184 mov %rsp,%rbp # backup %rsp
1185 mov 240($arg4),%eax # rounds
1186 mov $arg1,$inp # backup arguments
1193 mov %eax,%ebx # backup rounds
1194 shl \$7,%rax # 128 bytes per inner round key
1195 sub \$`128-32`,%rax # size of bit-sliced key schedule
1197 mov %rsp,%rax # pass key schedule
1198 mov $key,%rcx # pass key
1199 mov %ebx,%r10d # pass rounds
1200 call _bsaes_key_convert
1201 pxor %xmm6,%xmm7 # fix up last round key
1202 movdqa %xmm7,(%rax) # save last round key
1206 movdqu 0x00($inp), @XMM[0] # load input
1207 movdqu 0x10($inp), @XMM[1]
1208 movdqu 0x20($inp), @XMM[2]
1209 movdqu 0x30($inp), @XMM[3]
1210 movdqu 0x40($inp), @XMM[4]
1211 movdqu 0x50($inp), @XMM[5]
1212 mov %rsp, %rax # pass key schedule
1213 movdqu 0x60($inp), @XMM[6]
1214 mov %ebx,%r10d # pass rounds
1215 movdqu 0x70($inp), @XMM[7]
1216 lea 0x80($inp), $inp
1218 call _bsaes_encrypt8
1220 movdqu @XMM[0], 0x00($out) # write output
1221 movdqu @XMM[1], 0x10($out)
1222 movdqu @XMM[4], 0x20($out)
1223 movdqu @XMM[6], 0x30($out)
1224 movdqu @XMM[3], 0x40($out)
1225 movdqu @XMM[7], 0x50($out)
1226 movdqu @XMM[2], 0x60($out)
1227 movdqu @XMM[5], 0x70($out)
1228 lea 0x80($out), $out
1235 movdqu 0x00($inp), @XMM[0] # load input
1236 mov %rsp, %rax # pass key schedule
1237 mov %ebx,%r10d # pass rounds
1240 movdqu 0x10($inp), @XMM[1]
1242 movdqu 0x20($inp), @XMM[2]
1245 movdqu 0x30($inp), @XMM[3]
1247 movdqu 0x40($inp), @XMM[4]
1250 movdqu 0x50($inp), @XMM[5]
1252 movdqu 0x60($inp), @XMM[6]
1253 call _bsaes_encrypt8
1254 movdqu @XMM[0], 0x00($out) # write output
1255 movdqu @XMM[1], 0x10($out)
1256 movdqu @XMM[4], 0x20($out)
1257 movdqu @XMM[6], 0x30($out)
1258 movdqu @XMM[3], 0x40($out)
1259 movdqu @XMM[7], 0x50($out)
1260 movdqu @XMM[2], 0x60($out)
1264 call _bsaes_encrypt8
1265 movdqu @XMM[0], 0x00($out) # write output
1266 movdqu @XMM[1], 0x10($out)
1267 movdqu @XMM[4], 0x20($out)
1268 movdqu @XMM[6], 0x30($out)
1269 movdqu @XMM[3], 0x40($out)
1270 movdqu @XMM[7], 0x50($out)
1274 call _bsaes_encrypt8
1275 movdqu @XMM[0], 0x00($out) # write output
1276 movdqu @XMM[1], 0x10($out)
1277 movdqu @XMM[4], 0x20($out)
1278 movdqu @XMM[6], 0x30($out)
1279 movdqu @XMM[3], 0x40($out)
1283 call _bsaes_encrypt8
1284 movdqu @XMM[0], 0x00($out) # write output
1285 movdqu @XMM[1], 0x10($out)
1286 movdqu @XMM[4], 0x20($out)
1287 movdqu @XMM[6], 0x30($out)
1291 call _bsaes_encrypt8
1292 movdqu @XMM[0], 0x00($out) # write output
1293 movdqu @XMM[1], 0x10($out)
1294 movdqu @XMM[4], 0x20($out)
1298 call _bsaes_encrypt8
1299 movdqu @XMM[0], 0x00($out) # write output
1300 movdqu @XMM[1], 0x10($out)
1304 call _bsaes_encrypt8
1305 movdqu @XMM[0], 0x00($out) # write output
1312 call asm_AES_encrypt
1321 .Lecb_enc_bzero: # wipe key schedule [if any]
1322 movdqa %xmm0, 0x00(%rax)
1323 movdqa %xmm0, 0x10(%rax)
1324 lea 0x20(%rax), %rax
1328 lea (%rbp),%rsp # restore %rsp
1330 $code.=<<___ if ($win64);
1331 movaps 0x40(%rbp), %xmm6
1332 movaps 0x50(%rbp), %xmm7
1333 movaps 0x60(%rbp), %xmm8
1334 movaps 0x70(%rbp), %xmm9
1335 movaps 0x80(%rbp), %xmm10
1336 movaps 0x90(%rbp), %xmm11
1337 movaps 0xa0(%rbp), %xmm12
1338 movaps 0xb0(%rbp), %xmm13
1339 movaps 0xc0(%rbp), %xmm14
1340 movaps 0xd0(%rbp), %xmm15
1341 lea 0xa0(%rbp), %rsp
1344 mov 0x48(%rsp), %r15
1345 mov 0x50(%rsp), %r14
1346 mov 0x58(%rsp), %r13
1347 mov 0x60(%rsp), %r12
1348 mov 0x68(%rsp), %rbx
1349 mov 0x70(%rsp), %rax
1350 lea 0x78(%rsp), %rsp
1354 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1356 .globl bsaes_ecb_decrypt_blocks
1357 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1359 bsaes_ecb_decrypt_blocks:
1368 lea -0x48(%rsp),%rsp
1370 $code.=<<___ if ($win64);
1371 lea -0xa0(%rsp), %rsp
1372 movaps %xmm6, 0x40(%rsp)
1373 movaps %xmm7, 0x50(%rsp)
1374 movaps %xmm8, 0x60(%rsp)
1375 movaps %xmm9, 0x70(%rsp)
1376 movaps %xmm10, 0x80(%rsp)
1377 movaps %xmm11, 0x90(%rsp)
1378 movaps %xmm12, 0xa0(%rsp)
1379 movaps %xmm13, 0xb0(%rsp)
1380 movaps %xmm14, 0xc0(%rsp)
1381 movaps %xmm15, 0xd0(%rsp)
1385 mov %rsp,%rbp # backup %rsp
1386 mov 240($arg4),%eax # rounds
1387 mov $arg1,$inp # backup arguments
1394 mov %eax,%ebx # backup rounds
1395 shl \$7,%rax # 128 bytes per inner round key
1396 sub \$`128-32`,%rax # size of bit-sliced key schedule
1398 mov %rsp,%rax # pass key schedule
1399 mov $key,%rcx # pass key
1400 mov %ebx,%r10d # pass rounds
1401 call _bsaes_key_convert
1402 pxor (%rsp),%xmm7 # fix up 0 round key
1403 movdqa %xmm6,(%rax) # save last round key
1408 movdqu 0x00($inp), @XMM[0] # load input
1409 movdqu 0x10($inp), @XMM[1]
1410 movdqu 0x20($inp), @XMM[2]
1411 movdqu 0x30($inp), @XMM[3]
1412 movdqu 0x40($inp), @XMM[4]
1413 movdqu 0x50($inp), @XMM[5]
1414 mov %rsp, %rax # pass key schedule
1415 movdqu 0x60($inp), @XMM[6]
1416 mov %ebx,%r10d # pass rounds
1417 movdqu 0x70($inp), @XMM[7]
1418 lea 0x80($inp), $inp
1420 call _bsaes_decrypt8
1422 movdqu @XMM[0], 0x00($out) # write output
1423 movdqu @XMM[1], 0x10($out)
1424 movdqu @XMM[6], 0x20($out)
1425 movdqu @XMM[4], 0x30($out)
1426 movdqu @XMM[2], 0x40($out)
1427 movdqu @XMM[7], 0x50($out)
1428 movdqu @XMM[3], 0x60($out)
1429 movdqu @XMM[5], 0x70($out)
1430 lea 0x80($out), $out
1437 movdqu 0x00($inp), @XMM[0] # load input
1438 mov %rsp, %rax # pass key schedule
1439 mov %ebx,%r10d # pass rounds
1442 movdqu 0x10($inp), @XMM[1]
1444 movdqu 0x20($inp), @XMM[2]
1447 movdqu 0x30($inp), @XMM[3]
1449 movdqu 0x40($inp), @XMM[4]
1452 movdqu 0x50($inp), @XMM[5]
1454 movdqu 0x60($inp), @XMM[6]
1455 call _bsaes_decrypt8
1456 movdqu @XMM[0], 0x00($out) # write output
1457 movdqu @XMM[1], 0x10($out)
1458 movdqu @XMM[6], 0x20($out)
1459 movdqu @XMM[4], 0x30($out)
1460 movdqu @XMM[2], 0x40($out)
1461 movdqu @XMM[7], 0x50($out)
1462 movdqu @XMM[3], 0x60($out)
1466 call _bsaes_decrypt8
1467 movdqu @XMM[0], 0x00($out) # write output
1468 movdqu @XMM[1], 0x10($out)
1469 movdqu @XMM[6], 0x20($out)
1470 movdqu @XMM[4], 0x30($out)
1471 movdqu @XMM[2], 0x40($out)
1472 movdqu @XMM[7], 0x50($out)
1476 call _bsaes_decrypt8
1477 movdqu @XMM[0], 0x00($out) # write output
1478 movdqu @XMM[1], 0x10($out)
1479 movdqu @XMM[6], 0x20($out)
1480 movdqu @XMM[4], 0x30($out)
1481 movdqu @XMM[2], 0x40($out)
1485 call _bsaes_decrypt8
1486 movdqu @XMM[0], 0x00($out) # write output
1487 movdqu @XMM[1], 0x10($out)
1488 movdqu @XMM[6], 0x20($out)
1489 movdqu @XMM[4], 0x30($out)
1493 call _bsaes_decrypt8
1494 movdqu @XMM[0], 0x00($out) # write output
1495 movdqu @XMM[1], 0x10($out)
1496 movdqu @XMM[6], 0x20($out)
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1506 call _bsaes_decrypt8
1507 movdqu @XMM[0], 0x00($out) # write output
1514 call asm_AES_decrypt
1523 .Lecb_dec_bzero: # wipe key schedule [if any]
1524 movdqa %xmm0, 0x00(%rax)
1525 movdqa %xmm0, 0x10(%rax)
1526 lea 0x20(%rax), %rax
1530 lea (%rbp),%rsp # restore %rsp
1532 $code.=<<___ if ($win64);
1533 movaps 0x40(%rbp), %xmm6
1534 movaps 0x50(%rbp), %xmm7
1535 movaps 0x60(%rbp), %xmm8
1536 movaps 0x70(%rbp), %xmm9
1537 movaps 0x80(%rbp), %xmm10
1538 movaps 0x90(%rbp), %xmm11
1539 movaps 0xa0(%rbp), %xmm12
1540 movaps 0xb0(%rbp), %xmm13
1541 movaps 0xc0(%rbp), %xmm14
1542 movaps 0xd0(%rbp), %xmm15
1543 lea 0xa0(%rbp), %rsp
1546 mov 0x48(%rsp), %r15
1547 mov 0x50(%rsp), %r14
1548 mov 0x58(%rsp), %r13
1549 mov 0x60(%rsp), %r12
1550 mov 0x68(%rsp), %rbx
1551 mov 0x70(%rsp), %rax
1552 lea 0x78(%rsp), %rsp
1556 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1560 .extern asm_AES_cbc_encrypt
1561 .globl bsaes_cbc_encrypt
1562 .type bsaes_cbc_encrypt,\@abi-omnipotent
1566 $code.=<<___ if ($win64);
1567 mov 48(%rsp),$arg6 # pull direction flag
1571 jne asm_AES_cbc_encrypt
1573 jb asm_AES_cbc_encrypt
1583 lea -0x48(%rsp), %rsp
1585 $code.=<<___ if ($win64);
1586 mov 0xa0(%rsp),$arg5 # pull ivp
1587 lea -0xa0(%rsp), %rsp
1588 movaps %xmm6, 0x40(%rsp)
1589 movaps %xmm7, 0x50(%rsp)
1590 movaps %xmm8, 0x60(%rsp)
1591 movaps %xmm9, 0x70(%rsp)
1592 movaps %xmm10, 0x80(%rsp)
1593 movaps %xmm11, 0x90(%rsp)
1594 movaps %xmm12, 0xa0(%rsp)
1595 movaps %xmm13, 0xb0(%rsp)
1596 movaps %xmm14, 0xc0(%rsp)
1597 movaps %xmm15, 0xd0(%rsp)
1601 mov %rsp, %rbp # backup %rsp
1602 mov 240($arg4), %eax # rounds
1603 mov $arg1, $inp # backup arguments
1608 shr \$4, $len # bytes to blocks
1610 mov %eax, %edx # rounds
1611 shl \$7, %rax # 128 bytes per inner round key
1612 sub \$`128-32`, %rax # size of bit-sliced key schedule
1615 mov %rsp, %rax # pass key schedule
1616 mov $key, %rcx # pass key
1617 mov %edx, %r10d # pass rounds
1618 call _bsaes_key_convert
1619 pxor (%rsp),%xmm7 # fix up 0 round key
1620 movdqa %xmm6,(%rax) # save last round key
1623 movdqu (%rbx), @XMM[15] # load IV
1626 movdqu 0x00($inp), @XMM[0] # load input
1627 movdqu 0x10($inp), @XMM[1]
1628 movdqu 0x20($inp), @XMM[2]
1629 movdqu 0x30($inp), @XMM[3]
1630 movdqu 0x40($inp), @XMM[4]
1631 movdqu 0x50($inp), @XMM[5]
1632 mov %rsp, %rax # pass key schedule
1633 movdqu 0x60($inp), @XMM[6]
1634 mov %edx,%r10d # pass rounds
1635 movdqu 0x70($inp), @XMM[7]
1636 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1638 call _bsaes_decrypt8
1640 pxor 0x20(%rbp), @XMM[0] # ^= IV
1641 movdqu 0x00($inp), @XMM[8] # re-load input
1642 movdqu 0x10($inp), @XMM[9]
1643 pxor @XMM[8], @XMM[1]
1644 movdqu 0x20($inp), @XMM[10]
1645 pxor @XMM[9], @XMM[6]
1646 movdqu 0x30($inp), @XMM[11]
1647 pxor @XMM[10], @XMM[4]
1648 movdqu 0x40($inp), @XMM[12]
1649 pxor @XMM[11], @XMM[2]
1650 movdqu 0x50($inp), @XMM[13]
1651 pxor @XMM[12], @XMM[7]
1652 movdqu 0x60($inp), @XMM[14]
1653 pxor @XMM[13], @XMM[3]
1654 movdqu 0x70($inp), @XMM[15] # IV
1655 pxor @XMM[14], @XMM[5]
1656 movdqu @XMM[0], 0x00($out) # write output
1657 lea 0x80($inp), $inp
1658 movdqu @XMM[1], 0x10($out)
1659 movdqu @XMM[6], 0x20($out)
1660 movdqu @XMM[4], 0x30($out)
1661 movdqu @XMM[2], 0x40($out)
1662 movdqu @XMM[7], 0x50($out)
1663 movdqu @XMM[3], 0x60($out)
1664 movdqu @XMM[5], 0x70($out)
1665 lea 0x80($out), $out
1672 movdqu 0x00($inp), @XMM[0] # load input
1673 mov %rsp, %rax # pass key schedule
1674 mov %edx, %r10d # pass rounds
1677 movdqu 0x10($inp), @XMM[1]
1679 movdqu 0x20($inp), @XMM[2]
1682 movdqu 0x30($inp), @XMM[3]
1684 movdqu 0x40($inp), @XMM[4]
1687 movdqu 0x50($inp), @XMM[5]
1689 movdqu 0x60($inp), @XMM[6]
1690 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1691 call _bsaes_decrypt8
1692 pxor 0x20(%rbp), @XMM[0] # ^= IV
1693 movdqu 0x00($inp), @XMM[8] # re-load input
1694 movdqu 0x10($inp), @XMM[9]
1695 pxor @XMM[8], @XMM[1]
1696 movdqu 0x20($inp), @XMM[10]
1697 pxor @XMM[9], @XMM[6]
1698 movdqu 0x30($inp), @XMM[11]
1699 pxor @XMM[10], @XMM[4]
1700 movdqu 0x40($inp), @XMM[12]
1701 pxor @XMM[11], @XMM[2]
1702 movdqu 0x50($inp), @XMM[13]
1703 pxor @XMM[12], @XMM[7]
1704 movdqu 0x60($inp), @XMM[15] # IV
1705 pxor @XMM[13], @XMM[3]
1706 movdqu @XMM[0], 0x00($out) # write output
1707 movdqu @XMM[1], 0x10($out)
1708 movdqu @XMM[6], 0x20($out)
1709 movdqu @XMM[4], 0x30($out)
1710 movdqu @XMM[2], 0x40($out)
1711 movdqu @XMM[7], 0x50($out)
1712 movdqu @XMM[3], 0x60($out)
1716 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1717 call _bsaes_decrypt8
1718 pxor 0x20(%rbp), @XMM[0] # ^= IV
1719 movdqu 0x00($inp), @XMM[8] # re-load input
1720 movdqu 0x10($inp), @XMM[9]
1721 pxor @XMM[8], @XMM[1]
1722 movdqu 0x20($inp), @XMM[10]
1723 pxor @XMM[9], @XMM[6]
1724 movdqu 0x30($inp), @XMM[11]
1725 pxor @XMM[10], @XMM[4]
1726 movdqu 0x40($inp), @XMM[12]
1727 pxor @XMM[11], @XMM[2]
1728 movdqu 0x50($inp), @XMM[15] # IV
1729 pxor @XMM[12], @XMM[7]
1730 movdqu @XMM[0], 0x00($out) # write output
1731 movdqu @XMM[1], 0x10($out)
1732 movdqu @XMM[6], 0x20($out)
1733 movdqu @XMM[4], 0x30($out)
1734 movdqu @XMM[2], 0x40($out)
1735 movdqu @XMM[7], 0x50($out)
1739 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1740 call _bsaes_decrypt8
1741 pxor 0x20(%rbp), @XMM[0] # ^= IV
1742 movdqu 0x00($inp), @XMM[8] # re-load input
1743 movdqu 0x10($inp), @XMM[9]
1744 pxor @XMM[8], @XMM[1]
1745 movdqu 0x20($inp), @XMM[10]
1746 pxor @XMM[9], @XMM[6]
1747 movdqu 0x30($inp), @XMM[11]
1748 pxor @XMM[10], @XMM[4]
1749 movdqu 0x40($inp), @XMM[15] # IV
1750 pxor @XMM[11], @XMM[2]
1751 movdqu @XMM[0], 0x00($out) # write output
1752 movdqu @XMM[1], 0x10($out)
1753 movdqu @XMM[6], 0x20($out)
1754 movdqu @XMM[4], 0x30($out)
1755 movdqu @XMM[2], 0x40($out)
1759 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1760 call _bsaes_decrypt8
1761 pxor 0x20(%rbp), @XMM[0] # ^= IV
1762 movdqu 0x00($inp), @XMM[8] # re-load input
1763 movdqu 0x10($inp), @XMM[9]
1764 pxor @XMM[8], @XMM[1]
1765 movdqu 0x20($inp), @XMM[10]
1766 pxor @XMM[9], @XMM[6]
1767 movdqu 0x30($inp), @XMM[15] # IV
1768 pxor @XMM[10], @XMM[4]
1769 movdqu @XMM[0], 0x00($out) # write output
1770 movdqu @XMM[1], 0x10($out)
1771 movdqu @XMM[6], 0x20($out)
1772 movdqu @XMM[4], 0x30($out)
1776 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1777 call _bsaes_decrypt8
1778 pxor 0x20(%rbp), @XMM[0] # ^= IV
1779 movdqu 0x00($inp), @XMM[8] # re-load input
1780 movdqu 0x10($inp), @XMM[9]
1781 pxor @XMM[8], @XMM[1]
1782 movdqu 0x20($inp), @XMM[15] # IV
1783 pxor @XMM[9], @XMM[6]
1784 movdqu @XMM[0], 0x00($out) # write output
1785 movdqu @XMM[1], 0x10($out)
1786 movdqu @XMM[6], 0x20($out)
1790 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1791 call _bsaes_decrypt8
1792 pxor 0x20(%rbp), @XMM[0] # ^= IV
1793 movdqu 0x00($inp), @XMM[8] # re-load input
1794 movdqu 0x10($inp), @XMM[15] # IV
1795 pxor @XMM[8], @XMM[1]
1796 movdqu @XMM[0], 0x00($out) # write output
1797 movdqu @XMM[1], 0x10($out)
1802 lea 0x20(%rbp), $arg2 # buffer output
1804 call asm_AES_decrypt # doesn't touch %xmm
1805 pxor 0x20(%rbp), @XMM[15] # ^= IV
1806 movdqu @XMM[15], ($out) # write output
1807 movdqa @XMM[0], @XMM[15] # IV
1810 movdqu @XMM[15], (%rbx) # return IV
1813 .Lcbc_dec_bzero: # wipe key schedule [if any]
1814 movdqa %xmm0, 0x00(%rax)
1815 movdqa %xmm0, 0x10(%rax)
1816 lea 0x20(%rax), %rax
1820 lea (%rbp),%rsp # restore %rsp
1822 $code.=<<___ if ($win64);
1823 movaps 0x40(%rbp), %xmm6
1824 movaps 0x50(%rbp), %xmm7
1825 movaps 0x60(%rbp), %xmm8
1826 movaps 0x70(%rbp), %xmm9
1827 movaps 0x80(%rbp), %xmm10
1828 movaps 0x90(%rbp), %xmm11
1829 movaps 0xa0(%rbp), %xmm12
1830 movaps 0xb0(%rbp), %xmm13
1831 movaps 0xc0(%rbp), %xmm14
1832 movaps 0xd0(%rbp), %xmm15
1833 lea 0xa0(%rbp), %rsp
1836 mov 0x48(%rsp), %r15
1837 mov 0x50(%rsp), %r14
1838 mov 0x58(%rsp), %r13
1839 mov 0x60(%rsp), %r12
1840 mov 0x68(%rsp), %rbx
1841 mov 0x70(%rsp), %rax
1842 lea 0x78(%rsp), %rsp
1846 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
#                                 size_t blocks, const AES_KEY *key,
#                                 const unsigned char ivp[16]);
# CTR mode with a 32-bit big-endian counter in the last dword of the IV.
1848 .globl bsaes_ctr32_encrypt_blocks
1849 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1851 bsaes_ctr32_encrypt_blocks:
1860 lea -0x48(%rsp), %rsp
# Win64 only: 5th argument comes off the stack; save non-volatile %xmm regs
1862 $code.=<<___ if ($win64);
1863 mov 0xa0(%rsp),$arg5 # pull ivp
1864 lea -0xa0(%rsp), %rsp
1865 movaps %xmm6, 0x40(%rsp)
1866 movaps %xmm7, 0x50(%rsp)
1867 movaps %xmm8, 0x60(%rsp)
1868 movaps %xmm9, 0x70(%rsp)
1869 movaps %xmm10, 0x80(%rsp)
1870 movaps %xmm11, 0x90(%rsp)
1871 movaps %xmm12, 0xa0(%rsp)
1872 movaps %xmm13, 0xb0(%rsp)
1873 movaps %xmm14, 0xc0(%rsp)
1874 movaps %xmm15, 0xd0(%rsp)
1878 mov %rsp, %rbp # backup %rsp
1879 movdqu ($arg5), %xmm0 # load counter
1880 mov 240($arg4), %eax # rounds
1881 mov $arg1, $inp # backup arguments
1885 movdqa %xmm0, 0x20(%rbp) # copy counter
# allocate the bit-sliced key schedule on the stack: 128 bytes per inner
# round key, minus the 96 bytes saved by special first/last round handling
1889 mov %eax, %ebx # rounds
1890 shl \$7, %rax # 128 bytes per inner round key
1891 sub \$`128-32`, %rax # size of bit-sliced key schedule
# convert the conventional key schedule to bit-sliced form in place
1894 mov %rsp, %rax # pass key schedule
1895 mov $key, %rcx # pass key
1896 mov %ebx, %r10d # pass rounds
1897 call _bsaes_key_convert
1898 pxor %xmm6,%xmm7 # fix up last round key
1899 movdqa %xmm7,(%rax) # save last round key
# pre-swap round-0 key and counter so counters can be bumped with paddd
1901 movdqa (%rsp), @XMM[9] # load round0 key
1902 lea .LADD1(%rip), %r11
1903 movdqa 0x20(%rbp), @XMM[0] # counter copy
1904 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1905 pshufb @XMM[8], @XMM[9] # byte swap upper part
1906 pshufb @XMM[8], @XMM[0]
1907 movdqa @XMM[9], (%rsp) # save adjusted round0 key
# main loop: materialize 8 consecutive counter values in @XMM[0..7]
1911 movdqa @XMM[0], 0x20(%rbp) # save counter
1912 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1913 movdqa @XMM[0], @XMM[2]
1914 paddd 0x00(%r11), @XMM[1] # .LADD1
1915 movdqa @XMM[0], @XMM[3]
1916 paddd 0x10(%r11), @XMM[2] # .LADD2
1917 movdqa @XMM[0], @XMM[4]
1918 paddd 0x20(%r11), @XMM[3] # .LADD3
1919 movdqa @XMM[0], @XMM[5]
1920 paddd 0x30(%r11), @XMM[4] # .LADD4
1921 movdqa @XMM[0], @XMM[6]
1922 paddd 0x40(%r11), @XMM[5] # .LADD5
1923 movdqa @XMM[0], @XMM[7]
1924 paddd 0x50(%r11), @XMM[6] # .LADD6
1925 paddd 0x60(%r11), @XMM[7] # .LADD7
1927 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1928 # to flip byte order in 32-bit counter
1929 movdqa (%rsp), @XMM[9] # round 0 key
1930 lea 0x10(%rsp), %rax # pass key schedule
1931 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1932 pxor @XMM[9], @XMM[0] # xor with round0 key
1933 pxor @XMM[9], @XMM[1]
1934 pxor @XMM[9], @XMM[2]
1935 pxor @XMM[9], @XMM[3]
1936 pshufb @XMM[8], @XMM[0]
1937 pshufb @XMM[8], @XMM[1]
1938 pxor @XMM[9], @XMM[4]
1939 pxor @XMM[9], @XMM[5]
1940 pshufb @XMM[8], @XMM[2]
1941 pshufb @XMM[8], @XMM[3]
1942 pxor @XMM[9], @XMM[6]
1943 pxor @XMM[9], @XMM[7]
1944 pshufb @XMM[8], @XMM[4]
1945 pshufb @XMM[8], @XMM[5]
1946 pshufb @XMM[8], @XMM[6]
1947 pshufb @XMM[8], @XMM[7]
1948 lea .LBS0(%rip), %r11 # constants table
1949 mov %ebx,%r10d # pass rounds
1951 call _bsaes_encrypt8_bitslice
# fewer than 8 blocks left? handle them one at a time below
1954 jc .Lctr_enc_loop_done
# full 8-block iteration: XOR keystream with input; note _bsaes_encrypt8
# output register order is 0,1,4,6,3,7,2,5
1956 movdqu 0x00($inp), @XMM[8] # load input
1957 movdqu 0x10($inp), @XMM[9]
1958 movdqu 0x20($inp), @XMM[10]
1959 movdqu 0x30($inp), @XMM[11]
1960 movdqu 0x40($inp), @XMM[12]
1961 movdqu 0x50($inp), @XMM[13]
1962 movdqu 0x60($inp), @XMM[14]
1963 movdqu 0x70($inp), @XMM[15]
1965 pxor @XMM[0], @XMM[8]
1966 movdqa 0x20(%rbp), @XMM[0] # load counter
1967 pxor @XMM[9], @XMM[1]
1968 movdqu @XMM[8], 0x00($out) # write output
1969 pxor @XMM[10], @XMM[4]
1970 movdqu @XMM[1], 0x10($out)
1971 pxor @XMM[11], @XMM[6]
1972 movdqu @XMM[4], 0x20($out)
1973 pxor @XMM[12], @XMM[3]
1974 movdqu @XMM[6], 0x30($out)
1975 pxor @XMM[13], @XMM[7]
1976 movdqu @XMM[3], 0x40($out)
1977 pxor @XMM[14], @XMM[2]
1978 movdqu @XMM[7], 0x50($out)
1979 pxor @XMM[15], @XMM[5]
1980 movdqu @XMM[2], 0x60($out)
1981 lea .LADD1(%rip), %r11
1982 movdqu @XMM[5], 0x70($out)
1983 lea 0x80($out), $out
1984 paddd 0x70(%r11), @XMM[0] # .LADD8
# tail: write out only as many keystream blocks as remain (fall-through
# chain, one block per step)
1989 .Lctr_enc_loop_done:
1991 movdqu 0x00($inp), @XMM[8] # load input
1992 pxor @XMM[8], @XMM[0]
1993 movdqu @XMM[0], 0x00($out) # write output
1996 movdqu 0x10($inp), @XMM[9]
1997 pxor @XMM[9], @XMM[1]
1998 movdqu @XMM[1], 0x10($out)
2000 movdqu 0x20($inp), @XMM[10]
2001 pxor @XMM[10], @XMM[4]
2002 movdqu @XMM[4], 0x20($out)
2005 movdqu 0x30($inp), @XMM[11]
2006 pxor @XMM[11], @XMM[6]
2007 movdqu @XMM[6], 0x30($out)
2009 movdqu 0x40($inp), @XMM[12]
2010 pxor @XMM[12], @XMM[3]
2011 movdqu @XMM[3], 0x40($out)
2014 movdqu 0x50($inp), @XMM[13]
2015 pxor @XMM[13], @XMM[7]
2016 movdqu @XMM[7], 0x50($out)
2018 movdqu 0x60($inp), @XMM[14]
2019 pxor @XMM[14], @XMM[2]
2020 movdqu @XMM[2], 0x60($out)
# .Lctr_enc_short: short path — encrypt one counter block at a time with
# the non-bitsliced AES routine. The big-endian 32-bit counter lives in
# the last dword of the counter block at 0x30(%rbp), i.e. at 0x2c(%rbp)+4..
# FIX: the counter must be saved back to 0x2c(%rbp), the same slot it was
# loaded from ("2031"). It was being written to 0x2c(%rsp); %rsp has been
# lowered past %rbp by the key-schedule allocation, so the incremented
# counter was stored into scratch space and lost, repeating keystream
# blocks on subsequent loop iterations.
2025 lea 0x20(%rbp), $arg1
2026 lea 0x30(%rbp), $arg2
2028 call asm_AES_encrypt
2029 movdqu ($inp), @XMM[1]
2031 mov 0x2c(%rbp), %eax # load 32-bit counter
2033 pxor 0x30(%rbp), @XMM[1]
2034 inc %eax # increment
2035 movdqu @XMM[1], ($out)
2038 mov %eax, 0x2c(%rbp) # save 32-bit counter
# CTR epilogue: wipe the on-stack key schedule, then restore registers
# (mirrors the CBC epilogue above).
2045 .Lctr_enc_bzero: # wipe key schedule [if any]
2046 movdqa %xmm0, 0x00(%rax)
2047 movdqa %xmm0, 0x10(%rax)
2048 lea 0x20(%rax), %rax
2052 lea (%rbp),%rsp # restore %rsp
# Win64 only: restore non-volatile %xmm6-%xmm15
2054 $code.=<<___ if ($win64);
2055 movaps 0x40(%rbp), %xmm6
2056 movaps 0x50(%rbp), %xmm7
2057 movaps 0x60(%rbp), %xmm8
2058 movaps 0x70(%rbp), %xmm9
2059 movaps 0x80(%rbp), %xmm10
2060 movaps 0x90(%rbp), %xmm11
2061 movaps 0xa0(%rbp), %xmm12
2062 movaps 0xb0(%rbp), %xmm13
2063 movaps 0xc0(%rbp), %xmm14
2064 movaps 0xd0(%rbp), %xmm15
2065 lea 0xa0(%rbp), %rsp
# restore callee-saved GPRs
2068 mov 0x48(%rsp), %r15
2069 mov 0x50(%rsp), %r14
2070 mov 0x58(%rsp), %r13
2071 mov 0x60(%rsp), %r12
2072 mov 0x68(%rsp), %rbx
2073 mov 0x70(%rsp), %rax
2074 lea 0x78(%rsp), %rsp
2078 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2080 ######################################################################
2081 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2082 # const AES_KEY *key1, const AES_KEY *key2,
2083 # const unsigned char iv[16]);
2085 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2089 .globl bsaes_xts_encrypt
2090 .type bsaes_xts_encrypt,\@abi-omnipotent
2101 lea -0x48(%rsp), %rsp
# Win64 only: 5th/6th arguments come off the stack; save %xmm6-%xmm15
2103 $code.=<<___ if ($win64);
2104 mov 0xa0(%rsp),$arg5 # pull key2
2105 mov 0xa8(%rsp),$arg6 # pull ivp
2106 lea -0xa0(%rsp), %rsp
2107 movaps %xmm6, 0x40(%rsp)
2108 movaps %xmm7, 0x50(%rsp)
2109 movaps %xmm8, 0x60(%rsp)
2110 movaps %xmm9, 0x70(%rsp)
2111 movaps %xmm10, 0x80(%rsp)
2112 movaps %xmm11, 0x90(%rsp)
2113 movaps %xmm12, 0xa0(%rsp)
2114 movaps %xmm13, 0xb0(%rsp)
2115 movaps %xmm14, 0xc0(%rsp)
2116 movaps %xmm15, 0xd0(%rsp)
2120 mov %rsp, %rbp # backup %rsp
2121 mov $arg1, $inp # backup arguments
# initial tweak = AES-encrypt(IV) under key2
2127 lea 0x20(%rbp), $arg2
2129 call asm_AES_encrypt # generate initial tweak
2131 mov 240($key), %eax # rounds
2132 mov $len, %rbx # backup $len
# allocate and fill the bit-sliced key schedule (same layout as CTR/CBC)
2134 mov %eax, %edx # rounds
2135 shl \$7, %rax # 128 bytes per inner round key
2136 sub \$`128-32`, %rax # size of bit-sliced key schedule
2139 mov %rsp, %rax # pass key schedule
2140 mov $key, %rcx # pass key
2141 mov %edx, %r10d # pass rounds
2142 call _bsaes_key_convert
2143 pxor %xmm6, %xmm7 # fix up last round key
2144 movdqa %xmm7, (%rax) # save last round key
2147 sub \$0x80, %rsp # place for tweak[8]
2148 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# GF(2^128) doubling of the tweak: shift left by 1, conditionally XOR the
# reduction polynomial selected by .Lxts_magic when the top bit carries out
2151 movdqa .Lxts_magic(%rip), $twmask
2152 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# 8-block loop: compute tweak[0..7], load input and XOR tweaks in
2161 for ($i=0;$i<7;$i++) {
2163 pshufd \$0x13, $twtmp, $twres
2165 movdqa @XMM[7], @XMM[$i]
2166 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2167 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2168 pand $twmask, $twres # isolate carry and residue
2169 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2170 pxor $twres, @XMM[7]
2172 $code.=<<___ if ($i>=1);
2173 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2175 $code.=<<___ if ($i>=2);
2176 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2180 movdqu 0x60($inp), @XMM[8+6]
2181 pxor @XMM[8+5], @XMM[5]
2182 movdqu 0x70($inp), @XMM[8+7]
2183 lea 0x80($inp), $inp
2184 movdqa @XMM[7], 0x70(%rsp)
2185 pxor @XMM[8+6], @XMM[6]
2186 lea 0x80(%rsp), %rax # pass key schedule
2187 pxor @XMM[8+7], @XMM[7]
2188 mov %edx, %r10d # pass rounds
2190 call _bsaes_encrypt8
# XOR tweaks back in after encryption (XTS: C = E(P^T)^T);
# output register order of _bsaes_encrypt8 is 0,1,4,6,3,7,2,5
2192 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2193 pxor 0x10(%rsp), @XMM[1]
2194 movdqu @XMM[0], 0x00($out) # write output
2195 pxor 0x20(%rsp), @XMM[4]
2196 movdqu @XMM[1], 0x10($out)
2197 pxor 0x30(%rsp), @XMM[6]
2198 movdqu @XMM[4], 0x20($out)
2199 pxor 0x40(%rsp), @XMM[3]
2200 movdqu @XMM[6], 0x30($out)
2201 pxor 0x50(%rsp), @XMM[7]
2202 movdqu @XMM[3], 0x40($out)
2203 pxor 0x60(%rsp), @XMM[2]
2204 movdqu @XMM[7], 0x50($out)
2205 pxor 0x70(%rsp), @XMM[5]
2206 movdqu @XMM[2], 0x60($out)
2207 movdqu @XMM[5], 0x70($out)
2208 lea 0x80($out), $out
2210 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2212 movdqa .Lxts_magic(%rip), $twmask
2213 pcmpgtd @XMM[7], $twtmp
2214 pshufd \$0x13, $twtmp, $twres
2216 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2217 pand $twmask, $twres # isolate carry and residue
2218 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2219 pxor $twres, @XMM[7]
# short path: generate up to 7 tweaks, branching out by remaining length
2228 for ($i=0;$i<7;$i++) {
2230 pshufd \$0x13, $twtmp, $twres
2232 movdqa @XMM[7], @XMM[$i]
2233 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2239 $code.=<<___ if ($i>=1);
2240 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2241 cmp \$`0x10*$i`,$len
2244 $code.=<<___ if ($i>=2);
2245 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block case
2249 movdqu 0x60($inp), @XMM[8+6]
2250 pxor @XMM[8+5], @XMM[5]
2251 movdqa @XMM[7], 0x70(%rsp)
2252 lea 0x70($inp), $inp
2253 pxor @XMM[8+6], @XMM[6]
2254 lea 0x80(%rsp), %rax # pass key schedule
2255 mov %edx, %r10d # pass rounds
2257 call _bsaes_encrypt8
2259 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2260 pxor 0x10(%rsp), @XMM[1]
2261 movdqu @XMM[0], 0x00($out) # write output
2262 pxor 0x20(%rsp), @XMM[4]
2263 movdqu @XMM[1], 0x10($out)
2264 pxor 0x30(%rsp), @XMM[6]
2265 movdqu @XMM[4], 0x20($out)
2266 pxor 0x40(%rsp), @XMM[3]
2267 movdqu @XMM[6], 0x30($out)
2268 pxor 0x50(%rsp), @XMM[7]
2269 movdqu @XMM[3], 0x40($out)
2270 pxor 0x60(%rsp), @XMM[2]
2271 movdqu @XMM[7], 0x50($out)
2272 movdqu @XMM[2], 0x60($out)
2273 lea 0x70($out), $out
2275 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block case
2279 pxor @XMM[8+4], @XMM[4]
2280 lea 0x60($inp), $inp
2281 pxor @XMM[8+5], @XMM[5]
2282 lea 0x80(%rsp), %rax # pass key schedule
2283 mov %edx, %r10d # pass rounds
2285 call _bsaes_encrypt8
2287 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2288 pxor 0x10(%rsp), @XMM[1]
2289 movdqu @XMM[0], 0x00($out) # write output
2290 pxor 0x20(%rsp), @XMM[4]
2291 movdqu @XMM[1], 0x10($out)
2292 pxor 0x30(%rsp), @XMM[6]
2293 movdqu @XMM[4], 0x20($out)
2294 pxor 0x40(%rsp), @XMM[3]
2295 movdqu @XMM[6], 0x30($out)
2296 pxor 0x50(%rsp), @XMM[7]
2297 movdqu @XMM[3], 0x40($out)
2298 movdqu @XMM[7], 0x50($out)
2299 lea 0x60($out), $out
2301 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block case
2305 pxor @XMM[8+3], @XMM[3]
2306 lea 0x50($inp), $inp
2307 pxor @XMM[8+4], @XMM[4]
2308 lea 0x80(%rsp), %rax # pass key schedule
2309 mov %edx, %r10d # pass rounds
2311 call _bsaes_encrypt8
2313 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2314 pxor 0x10(%rsp), @XMM[1]
2315 movdqu @XMM[0], 0x00($out) # write output
2316 pxor 0x20(%rsp), @XMM[4]
2317 movdqu @XMM[1], 0x10($out)
2318 pxor 0x30(%rsp), @XMM[6]
2319 movdqu @XMM[4], 0x20($out)
2320 pxor 0x40(%rsp), @XMM[3]
2321 movdqu @XMM[6], 0x30($out)
2322 movdqu @XMM[3], 0x40($out)
2323 lea 0x50($out), $out
2325 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block case
2329 pxor @XMM[8+2], @XMM[2]
2330 lea 0x40($inp), $inp
2331 pxor @XMM[8+3], @XMM[3]
2332 lea 0x80(%rsp), %rax # pass key schedule
2333 mov %edx, %r10d # pass rounds
2335 call _bsaes_encrypt8
2337 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2338 pxor 0x10(%rsp), @XMM[1]
2339 movdqu @XMM[0], 0x00($out) # write output
2340 pxor 0x20(%rsp), @XMM[4]
2341 movdqu @XMM[1], 0x10($out)
2342 pxor 0x30(%rsp), @XMM[6]
2343 movdqu @XMM[4], 0x20($out)
2344 movdqu @XMM[6], 0x30($out)
2345 lea 0x40($out), $out
2347 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block case
2351 pxor @XMM[8+1], @XMM[1]
2352 lea 0x30($inp), $inp
2353 pxor @XMM[8+2], @XMM[2]
2354 lea 0x80(%rsp), %rax # pass key schedule
2355 mov %edx, %r10d # pass rounds
2357 call _bsaes_encrypt8
2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2360 pxor 0x10(%rsp), @XMM[1]
2361 movdqu @XMM[0], 0x00($out) # write output
2362 pxor 0x20(%rsp), @XMM[4]
2363 movdqu @XMM[1], 0x10($out)
2364 movdqu @XMM[4], 0x20($out)
2365 lea 0x30($out), $out
2367 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block case
2371 pxor @XMM[8+0], @XMM[0]
2372 lea 0x20($inp), $inp
2373 pxor @XMM[8+1], @XMM[1]
2374 lea 0x80(%rsp), %rax # pass key schedule
2375 mov %edx, %r10d # pass rounds
2377 call _bsaes_encrypt8
2379 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2380 pxor 0x10(%rsp), @XMM[1]
2381 movdqu @XMM[0], 0x00($out) # write output
2382 movdqu @XMM[1], 0x10($out)
2383 lea 0x20($out), $out
2385 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block case: single block via the non-bitsliced routine
2389 pxor @XMM[0], @XMM[8]
2390 lea 0x10($inp), $inp
2391 movdqa @XMM[8], 0x20(%rbp)
2392 lea 0x20(%rbp), $arg1
2393 lea 0x20(%rbp), $arg2
2395 call asm_AES_encrypt # doesn't touch %xmm
2396 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2397 #pxor @XMM[8], @XMM[0]
2398 #lea 0x80(%rsp), %rax # pass key schedule
2399 #mov %edx, %r10d # pass rounds
2400 #call _bsaes_encrypt8
2401 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2402 movdqu @XMM[0], 0x00($out) # write output
2403 lea 0x10($out), $out
2405 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing (XTS, partial final block): swap bytes between the
# tail and the last full ciphertext block, then re-encrypt that block.
# FIX: line "2414" read the stolen ciphertext byte from -16(%rdx); the last
# full ciphertext block is addressed as -16($out) throughout this sequence
# (cf. "2422" and "2430"), and %rdx is not $out under either calling
# convention, so the hard-coded register read the wrong memory.
2414 movzb -16($out), %ecx
2422 movdqu -16($out), @XMM[0]
2423 lea 0x20(%rbp), $arg1
2424 pxor @XMM[7], @XMM[0]
2425 lea 0x20(%rbp), $arg2
2426 movdqa @XMM[0], 0x20(%rbp)
2428 call asm_AES_encrypt # doesn't touch %xmm
2429 pxor 0x20(%rbp), @XMM[7]
2430 movdqu @XMM[7], -16($out)
# XTS-encrypt epilogue: wipe key schedule, restore registers (same pattern
# as the CBC/CTR epilogues).
2435 .Lxts_enc_bzero: # wipe key schedule [if any]
2436 movdqa %xmm0, 0x00(%rax)
2437 movdqa %xmm0, 0x10(%rax)
2438 lea 0x20(%rax), %rax
2442 lea (%rbp),%rsp # restore %rsp
# Win64 only: restore non-volatile %xmm6-%xmm15
2444 $code.=<<___ if ($win64);
2445 movaps 0x40(%rbp), %xmm6
2446 movaps 0x50(%rbp), %xmm7
2447 movaps 0x60(%rbp), %xmm8
2448 movaps 0x70(%rbp), %xmm9
2449 movaps 0x80(%rbp), %xmm10
2450 movaps 0x90(%rbp), %xmm11
2451 movaps 0xa0(%rbp), %xmm12
2452 movaps 0xb0(%rbp), %xmm13
2453 movaps 0xc0(%rbp), %xmm14
2454 movaps 0xd0(%rbp), %xmm15
2455 lea 0xa0(%rbp), %rsp
# restore callee-saved GPRs
2458 mov 0x48(%rsp), %r15
2459 mov 0x50(%rsp), %r14
2460 mov 0x58(%rsp), %r13
2461 mov 0x60(%rsp), %r12
2462 mov 0x68(%rsp), %rbx
2463 mov 0x70(%rsp), %rax
2464 lea 0x78(%rsp), %rsp
2468 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
# bsaes_xts_decrypt: mirror image of bsaes_xts_encrypt. Note the initial
# tweak is still generated with asm_AES_*encrypt* under key2 (XTS always
# encrypts the IV), and _bsaes_decrypt8 output register order is
# 0,1,6,4,2,7,3,5 (different from the encrypt order).
2470 .globl bsaes_xts_decrypt
2471 .type bsaes_xts_decrypt,\@abi-omnipotent
2482 lea -0x48(%rsp), %rsp
# Win64 only: pull stack args, save %xmm6-%xmm15
2484 $code.=<<___ if ($win64);
2485 mov 0xa0(%rsp),$arg5 # pull key2
2486 mov 0xa8(%rsp),$arg6 # pull ivp
2487 lea -0xa0(%rsp), %rsp
2488 movaps %xmm6, 0x40(%rsp)
2489 movaps %xmm7, 0x50(%rsp)
2490 movaps %xmm8, 0x60(%rsp)
2491 movaps %xmm9, 0x70(%rsp)
2492 movaps %xmm10, 0x80(%rsp)
2493 movaps %xmm11, 0x90(%rsp)
2494 movaps %xmm12, 0xa0(%rsp)
2495 movaps %xmm13, 0xb0(%rsp)
2496 movaps %xmm14, 0xc0(%rsp)
2497 movaps %xmm15, 0xd0(%rsp)
2501 mov %rsp, %rbp # backup %rsp
2502 mov $arg1, $inp # backup arguments
2508 lea 0x20(%rbp), $arg2
2510 call asm_AES_encrypt # generate initial tweak
2512 mov 240($key), %eax # rounds
2513 mov $len, %rbx # backup $len
2515 mov %eax, %edx # rounds
2516 shl \$7, %rax # 128 bytes per inner round key
2517 sub \$`128-32`, %rax # size of bit-sliced key schedule
# decrypt-direction key schedule: round-0 and last-round fixups are
# swapped relative to the encrypt path
2520 mov %rsp, %rax # pass key schedule
2521 mov $key, %rcx # pass key
2522 mov %edx, %r10d # pass rounds
2523 call _bsaes_key_convert
2524 pxor (%rsp), %xmm7 # fix up round 0 key
2525 movdqa %xmm6, (%rax) # save last round key
2526 movdqa %xmm7, (%rsp)
# if the length is not block-aligned, hold back one full block for
# ciphertext stealing
2528 xor %eax, %eax # if ($len%16) len-=16;
2535 sub \$0x80, %rsp # place for tweak[8]
2536 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2539 movdqa .Lxts_magic(%rip), $twmask
2540 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# 8-block loop: compute tweak[0..7] (GF(2^128) doubling), XOR input in
2549 for ($i=0;$i<7;$i++) {
2551 pshufd \$0x13, $twtmp, $twres
2553 movdqa @XMM[7], @XMM[$i]
2554 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2555 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2556 pand $twmask, $twres # isolate carry and residue
2557 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2558 pxor $twres, @XMM[7]
2560 $code.=<<___ if ($i>=1);
2561 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2563 $code.=<<___ if ($i>=2);
2564 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2568 movdqu 0x60($inp), @XMM[8+6]
2569 pxor @XMM[8+5], @XMM[5]
2570 movdqu 0x70($inp), @XMM[8+7]
2571 lea 0x80($inp), $inp
2572 movdqa @XMM[7], 0x70(%rsp)
2573 pxor @XMM[8+6], @XMM[6]
2574 lea 0x80(%rsp), %rax # pass key schedule
2575 pxor @XMM[8+7], @XMM[7]
2576 mov %edx, %r10d # pass rounds
2578 call _bsaes_decrypt8
# XOR tweaks back in after decryption (XTS: P = D(C^T)^T)
2580 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2581 pxor 0x10(%rsp), @XMM[1]
2582 movdqu @XMM[0], 0x00($out) # write output
2583 pxor 0x20(%rsp), @XMM[6]
2584 movdqu @XMM[1], 0x10($out)
2585 pxor 0x30(%rsp), @XMM[4]
2586 movdqu @XMM[6], 0x20($out)
2587 pxor 0x40(%rsp), @XMM[2]
2588 movdqu @XMM[4], 0x30($out)
2589 pxor 0x50(%rsp), @XMM[7]
2590 movdqu @XMM[2], 0x40($out)
2591 pxor 0x60(%rsp), @XMM[3]
2592 movdqu @XMM[7], 0x50($out)
2593 pxor 0x70(%rsp), @XMM[5]
2594 movdqu @XMM[3], 0x60($out)
2595 movdqu @XMM[5], 0x70($out)
2596 lea 0x80($out), $out
2598 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2600 movdqa .Lxts_magic(%rip), $twmask
2601 pcmpgtd @XMM[7], $twtmp
2602 pshufd \$0x13, $twtmp, $twres
2604 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2605 pand $twmask, $twres # isolate carry and residue
2606 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2607 pxor $twres, @XMM[7]
# short path: up to 7 remaining blocks, branch by length
2616 for ($i=0;$i<7;$i++) {
2618 pshufd \$0x13, $twtmp, $twres
2620 movdqa @XMM[7], @XMM[$i]
2621 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2622 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2623 pand $twmask, $twres # isolate carry and residue
2624 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2625 pxor $twres, @XMM[7]
2627 $code.=<<___ if ($i>=1);
2628 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2629 cmp \$`0x10*$i`,$len
2632 $code.=<<___ if ($i>=2);
2633 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block case
2637 movdqu 0x60($inp), @XMM[8+6]
2638 pxor @XMM[8+5], @XMM[5]
2639 movdqa @XMM[7], 0x70(%rsp)
2640 lea 0x70($inp), $inp
2641 pxor @XMM[8+6], @XMM[6]
2642 lea 0x80(%rsp), %rax # pass key schedule
2643 mov %edx, %r10d # pass rounds
2645 call _bsaes_decrypt8
2647 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2648 pxor 0x10(%rsp), @XMM[1]
2649 movdqu @XMM[0], 0x00($out) # write output
2650 pxor 0x20(%rsp), @XMM[6]
2651 movdqu @XMM[1], 0x10($out)
2652 pxor 0x30(%rsp), @XMM[4]
2653 movdqu @XMM[6], 0x20($out)
2654 pxor 0x40(%rsp), @XMM[2]
2655 movdqu @XMM[4], 0x30($out)
2656 pxor 0x50(%rsp), @XMM[7]
2657 movdqu @XMM[2], 0x40($out)
2658 pxor 0x60(%rsp), @XMM[3]
2659 movdqu @XMM[7], 0x50($out)
2660 movdqu @XMM[3], 0x60($out)
2661 lea 0x70($out), $out
2663 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block case
2667 pxor @XMM[8+4], @XMM[4]
2668 lea 0x60($inp), $inp
2669 pxor @XMM[8+5], @XMM[5]
2670 lea 0x80(%rsp), %rax # pass key schedule
2671 mov %edx, %r10d # pass rounds
2673 call _bsaes_decrypt8
2675 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2676 pxor 0x10(%rsp), @XMM[1]
2677 movdqu @XMM[0], 0x00($out) # write output
2678 pxor 0x20(%rsp), @XMM[6]
2679 movdqu @XMM[1], 0x10($out)
2680 pxor 0x30(%rsp), @XMM[4]
2681 movdqu @XMM[6], 0x20($out)
2682 pxor 0x40(%rsp), @XMM[2]
2683 movdqu @XMM[4], 0x30($out)
2684 pxor 0x50(%rsp), @XMM[7]
2685 movdqu @XMM[2], 0x40($out)
2686 movdqu @XMM[7], 0x50($out)
2687 lea 0x60($out), $out
2689 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block case
2693 pxor @XMM[8+3], @XMM[3]
2694 lea 0x50($inp), $inp
2695 pxor @XMM[8+4], @XMM[4]
2696 lea 0x80(%rsp), %rax # pass key schedule
2697 mov %edx, %r10d # pass rounds
2699 call _bsaes_decrypt8
2701 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2702 pxor 0x10(%rsp), @XMM[1]
2703 movdqu @XMM[0], 0x00($out) # write output
2704 pxor 0x20(%rsp), @XMM[6]
2705 movdqu @XMM[1], 0x10($out)
2706 pxor 0x30(%rsp), @XMM[4]
2707 movdqu @XMM[6], 0x20($out)
2708 pxor 0x40(%rsp), @XMM[2]
2709 movdqu @XMM[4], 0x30($out)
2710 movdqu @XMM[2], 0x40($out)
2711 lea 0x50($out), $out
2713 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block case
2717 pxor @XMM[8+2], @XMM[2]
2718 lea 0x40($inp), $inp
2719 pxor @XMM[8+3], @XMM[3]
2720 lea 0x80(%rsp), %rax # pass key schedule
2721 mov %edx, %r10d # pass rounds
2723 call _bsaes_decrypt8
2725 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2726 pxor 0x10(%rsp), @XMM[1]
2727 movdqu @XMM[0], 0x00($out) # write output
2728 pxor 0x20(%rsp), @XMM[6]
2729 movdqu @XMM[1], 0x10($out)
2730 pxor 0x30(%rsp), @XMM[4]
2731 movdqu @XMM[6], 0x20($out)
2732 movdqu @XMM[4], 0x30($out)
2733 lea 0x40($out), $out
2735 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block case
2739 pxor @XMM[8+1], @XMM[1]
2740 lea 0x30($inp), $inp
2741 pxor @XMM[8+2], @XMM[2]
2742 lea 0x80(%rsp), %rax # pass key schedule
2743 mov %edx, %r10d # pass rounds
2745 call _bsaes_decrypt8
2747 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2748 pxor 0x10(%rsp), @XMM[1]
2749 movdqu @XMM[0], 0x00($out) # write output
2750 pxor 0x20(%rsp), @XMM[6]
2751 movdqu @XMM[1], 0x10($out)
2752 movdqu @XMM[6], 0x20($out)
2753 lea 0x30($out), $out
2755 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block case
2759 pxor @XMM[8+0], @XMM[0]
2760 lea 0x20($inp), $inp
2761 pxor @XMM[8+1], @XMM[1]
2762 lea 0x80(%rsp), %rax # pass key schedule
2763 mov %edx, %r10d # pass rounds
2765 call _bsaes_decrypt8
2767 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2768 pxor 0x10(%rsp), @XMM[1]
2769 movdqu @XMM[0], 0x00($out) # write output
2770 movdqu @XMM[1], 0x10($out)
2771 lea 0x20($out), $out
2773 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block case: single block via the non-bitsliced routine
2777 pxor @XMM[0], @XMM[8]
2778 lea 0x10($inp), $inp
2779 movdqa @XMM[8], 0x20(%rbp)
2780 lea 0x20(%rbp), $arg1
2781 lea 0x20(%rbp), $arg2
2783 call asm_AES_decrypt # doesn't touch %xmm
2784 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2785 #pxor @XMM[8], @XMM[0]
2786 #lea 0x80(%rsp), %rax # pass key schedule
2787 #mov %edx, %r10d # pass rounds
2788 #call _bsaes_decrypt8
2789 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2790 movdqu @XMM[0], 0x00($out) # write output
2791 lea 0x10($out), $out
2793 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# ciphertext stealing (decrypt): the second-to-last block must be
# decrypted with the *next* tweak first (kept in @XMM[6] below), then the
# partial tail is handled and the block re-decrypted with the current tweak
2800 movdqa .Lxts_magic(%rip), $twmask
2801 pcmpgtd @XMM[7], $twtmp
2802 pshufd \$0x13, $twtmp, $twres
2803 movdqa @XMM[7], @XMM[6]
2804 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2805 pand $twmask, $twres # isolate carry and residue
2806 movdqu ($inp), @XMM[0]
2807 pxor $twres, @XMM[7]
2809 lea 0x20(%rbp), $arg1
2810 pxor @XMM[7], @XMM[0]
2811 lea 0x20(%rbp), $arg2
2812 movdqa @XMM[0], 0x20(%rbp)
2814 call asm_AES_decrypt # doesn't touch %xmm
2815 pxor 0x20(%rbp), @XMM[7]
2817 movdqu @XMM[7], ($out)
2820 movzb 16($inp), %eax
# re-decrypt the assembled last full block with the saved tweak @XMM[6]
2829 movdqu ($out), @XMM[0]
2830 lea 0x20(%rbp), $arg1
2831 pxor @XMM[6], @XMM[0]
2832 lea 0x20(%rbp), $arg2
2833 movdqa @XMM[0], 0x20(%rbp)
2835 call asm_AES_decrypt # doesn't touch %xmm
2836 pxor 0x20(%rbp), @XMM[6]
2837 movdqu @XMM[6], ($out)
# epilogue: wipe key schedule, restore registers
2842 .Lxts_dec_bzero: # wipe key schedule [if any]
2843 movdqa %xmm0, 0x00(%rax)
2844 movdqa %xmm0, 0x10(%rax)
2845 lea 0x20(%rax), %rax
2849 lea (%rbp),%rsp # restore %rsp
# Win64 only: restore non-volatile %xmm6-%xmm15
2851 $code.=<<___ if ($win64);
2852 movaps 0x40(%rbp), %xmm6
2853 movaps 0x50(%rbp), %xmm7
2854 movaps 0x60(%rbp), %xmm8
2855 movaps 0x70(%rbp), %xmm9
2856 movaps 0x80(%rbp), %xmm10
2857 movaps 0x90(%rbp), %xmm11
2858 movaps 0xa0(%rbp), %xmm12
2859 movaps 0xb0(%rbp), %xmm13
2860 movaps 0xc0(%rbp), %xmm14
2861 movaps 0xd0(%rbp), %xmm15
2862 lea 0xa0(%rbp), %rsp
# restore callee-saved GPRs
2865 mov 0x48(%rsp), %r15
2866 mov 0x50(%rsp), %r14
2867 mov 0x58(%rsp), %r13
2868 mov 0x60(%rsp), %r12
2869 mov 0x68(%rsp), %rbx
2870 mov 0x70(%rsp), %rax
2871 lea 0x78(%rsp), %rsp
2875 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Constant tables shared by the bit-sliced routines. NOTE(review): several
# label lines are omitted from this listing (inner numbering gaps); the
# unlabeled .quad runs below belong to labels defined on those missing
# lines — confirm against the full file before relying on the attributions.
2879 .type _bsaes_const,\@object
2882 .LM0ISR: # InvShiftRows constants
2883 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2885 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2887 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2888 .LBS0: # bit-slice constants
2889 .quad 0x5555555555555555, 0x5555555555555555
2891 .quad 0x3333333333333333, 0x3333333333333333
2893 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2894 .LSR: # shiftrows constants
2895 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2897 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2899 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2900 .LSWPUP: # byte-swap upper dword
2901 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2903 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2904 .LADD1: # counter increment constants
2905 .quad 0x0000000000000000, 0x0000000100000000
2907 .quad 0x0000000000000000, 0x0000000200000000
2909 .quad 0x0000000000000000, 0x0000000300000000
2911 .quad 0x0000000000000000, 0x0000000400000000
2913 .quad 0x0000000000000000, 0x0000000500000000
2915 .quad 0x0000000000000000, 0x0000000600000000
2917 .quad 0x0000000000000000, 0x0000000700000000
2919 .quad 0x0000000000000000, 0x0000000800000000
# bit masks used by the key conversion (presumably .Lmasks — label omitted)
2923 .quad 0x0101010101010101, 0x0101010101010101
2924 .quad 0x0202020202020202, 0x0202020202020202
2925 .quad 0x0404040404040404, 0x0404040404040404
2926 .quad 0x0808080808080808, 0x0808080808080808
2928 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 = AES S-box affine constant, replicated (presumably .L63)
2930 .quad 0x6363636363636363, 0x6363636363636363
2931 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2933 .size _bsaes_const,.-_bsaes_const
# Win64 structured-exception handler: if the fault lies between the
# prologue and epilogue labels (HandlerData[0]/[1]), restore %xmm6-%xmm15
# and the callee-saved GPRs from the frame, then continue the unwind via
# RtlVirtualUnwind.
2936 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2937 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2945 .extern __imp_RtlVirtualUnwind
2946 .type se_handler,\@abi-omnipotent
2960 mov 120($context),%rax # pull context->Rax
2961 mov 248($context),%rbx # pull context->Rip
2963 mov 8($disp),%rsi # disp->ImageBase
2964 mov 56($disp),%r11 # disp->HandlerData
# in prologue? nothing was pushed yet — leave context untouched
2966 mov 0(%r11),%r10d # HandlerData[0]
2967 lea (%rsi,%r10),%r10 # prologue label
2968 cmp %r10,%rbx # context->Rip<prologue label
2971 mov 152($context),%rax # pull context->Rsp
# past epilogue? registers already restored — leave context untouched
2973 mov 4(%r11),%r10d # HandlerData[1]
2974 lea (%rsi,%r10),%r10 # epilogue label
2975 cmp %r10,%rbx # context->Rip>=epilogue label
2978 mov 160($context),%rax # pull context->Rbp
# copy the 10 saved %xmm registers from the frame into the CONTEXT record
2980 lea 0x40(%rax),%rsi # %xmm save area
2981 lea 512($context),%rdi # &context.Xmm6
2982 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2983 .long 0xa548f3fc # cld; rep movsq
2984 lea 0xa0(%rax),%rax # adjust stack pointer
2992 lea 0x78(%rax),%rax # adjust stack pointer
2993 mov %rbx,144($context) # restore context->Rbx
2994 mov %rbp,160($context) # restore context->Rbp
2995 mov %r12,216($context) # restore context->R12
2996 mov %r13,224($context) # restore context->R13
2997 mov %r14,232($context) # restore context->R14
2998 mov %r15,240($context) # restore context->R15
3001 mov %rax,152($context) # restore context->Rsp
# hand the (possibly fixed-up) context to RtlVirtualUnwind
3003 mov 40($disp),%rdi # disp->ContextRecord
3004 mov $context,%rsi # context
3005 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3006 .long 0xa548f3fc # cld; rep movsq
3009 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3010 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3011 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3012 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3013 mov 40(%rsi),%r10 # disp->ContextRecord
3014 lea 56(%rsi),%r11 # &disp->HandlerData
3015 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3016 mov %r10,32(%rsp) # arg5
3017 mov %r11,40(%rsp) # arg6
3018 mov %r12,48(%rsp) # arg7
3019 mov %rcx,56(%rsp) # arg8, (NULL)
3020 call *__imp_RtlVirtualUnwind(%rip)
3022 mov \$1,%eax # ExceptionContinueSearch
3034 .size se_handler,.-se_handler
# Win64 SEH tables: .pdata-style prologue/epilogue RVA pairs per entry
# point (ECB entries only emitted when $ecb is set) ...
3039 $code.=<<___ if ($ecb);
3040 .rva .Lecb_enc_prologue
3041 .rva .Lecb_enc_epilogue
3044 .rva .Lecb_dec_prologue
3045 .rva .Lecb_dec_epilogue
3049 .rva .Lcbc_dec_prologue
3050 .rva .Lcbc_dec_epilogue
3053 .rva .Lctr_enc_prologue
3054 .rva .Lctr_enc_epilogue
3057 .rva .Lxts_enc_prologue
3058 .rva .Lxts_enc_epilogue
3061 .rva .Lxts_dec_prologue
3062 .rva .Lxts_dec_epilogue
# ... and .xdata-style HandlerData ranges consumed by se_handler above
3068 $code.=<<___ if ($ecb);
3072 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3076 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3082 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3086 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3090 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3094 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# evaluate the `...` arithmetic placeholders embedded in $code
3098 $code =~ s/\`([^\`]*)\`/eval($1)/gem;