crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # June 2011
  11 #
  12 # This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
  13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
  15 # parallelism, interleaving it with another algorithm allows the
  16 # processor's resources to be utilized better, improving performance.
  17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
  18 # AESNI code is woven into them. Below are performance numbers in
  19 # cycles per processed byte (less is better) for standalone AESNI-CBC
  20 # encrypt, the sum of the latter and standalone SHA1, and the "stitched"
  21 # subroutine:
  22 #
  23 #               AES-128-CBC     +SHA1           stitch      gain
  24 # Westmere      3.77[+5.6]      9.37            6.65        +41%
  25 # Sandy Bridge  5.05[+5.2(6.3)] 10.25(11.35)    6.16(7.08)  +67%(+60%)
  26 #
  27 #               AES-192-CBC
  28 # Westmere      4.51            10.11           6.97        +45%
  29 # Sandy Bridge  6.05            11.25(12.35)    6.34(7.27)  +77%(+70%)
  30 #
  31 #               AES-256-CBC
  32 # Westmere      5.25            10.85           7.25        +50%
  33 # Sandy Bridge  7.05            12.25(13.35)    7.06(7.70)  +74%(+73%)
  34 #
  35 # (*)   There are two code paths: SSSE3 and AVX. See sha1-586.pl for
  36 #       background information. Above numbers in parentheses are SSSE3
  37 #       results collected on AVX-capable CPU, i.e. apply to OSes that
  38 #       don't support AVX.
  39 #
  40 # Needless to say, it makes no sense to implement a "stitched"
  41 # *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
  42 # fully utilize parallelism, so stitching would not give any gain
  43 # anyway. Well, there might be some, e.g. because of better cache
  44 # locality... For reference, here are performance results for
  45 # standalone AESNI-CBC decrypt:
  46 #
  47 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
  48 # Westmere      1.31            1.55            1.80
  49 # Sandy Bridge  0.93            1.06            1.22
  50
   51 $flavour = shift;
   52 $output  = shift;
     # Arguments are "[flavour] output".  If the first argument contains a dot
     # it is treated as the output file name and the flavour is left undefined.
   53 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
   54
     # Win64 is selected by flavour (nasm, masm or mingw64) or by an output
     # name ending in ".asm".  $win64 later picks stack offsets and enables
     # the %xmm6-%xmm15 save/restore code.
   55 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   56
     # Locate the x86_64-xlate.pl translator: first in this script's own
     # directory, then in ../../perlasm/ relative to it; give up otherwise.
   57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   58 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   59 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   60 die "can't locate x86_64-xlate.pl";
   61
     # Decide whether the AVX code path is generated at all.  $avx is set as
     # soon as one of the following tool probes succeeds:
     #   - $ENV{CC} drives a GNU assembler reporting version >= 2.19;
     #   - Win64 with nasm (by flavour or $ENV{ASM}) reporting version >= 2.09;
     #   - Win64 with masm/ml64 reporting major version >= 10;
     #   - "$ENV{CC} -v" identifies clang/LLVM with version >= 3.0.
     # Each probe runs an external command through backticks and parses the
     # version number out of its output.
   62 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
   63                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
   64            $1>=2.19);
   65 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
   66            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
   67            $1>=2.09);
   68 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
   69            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
   70            $1>=10);
   71 $avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
  72
  73 open OUT,"| \"$^X\" $xlate $flavour $output";
  74 *STDOUT=*OUT;
  75
  76 # void aesni_cbc_sha1_enc(const void *inp,
  77 #                       void *out,
  78 #                       size_t length,
  79 #                       const AES_KEY *key,
  80 #                       unsigned char *iv,
  81 #                       SHA_CTX *ctx,
  82 #                       const void *in0);
   83
     # Emit the public entry point aesni_cbc_sha1_enc, a run-time dispatcher.
     # It loads the first two 32-bit words of OPENSSL_ia32cap_P into %r10d and
     # %r11d and then transfers control to one of the two implementations.
   84 $code.=<<___;
   85 .text
   86 .extern OPENSSL_ia32cap_P
   87
   88 .globl  aesni_cbc_sha1_enc
   89 .type   aesni_cbc_sha1_enc,\@abi-omnipotent
   90 .align  16
   91 aesni_cbc_sha1_enc:
   92         # caller should check for SSSE3 and AES-NI bits
   93         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
   94         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
   95 ___
     # Only when the build tools can assemble AVX: test bit 28 of the second
     # word and bit 30 of the first (labelled below as the AVX and "Intel CPU"
     # bits) and take the AVX implementation only if both are set.
   96 $code.=<<___ if ($avx);
   97         and     \$`1<<28`,%r11d         # mask AVX bit
   98         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
   99         or      %r11d,%r10d
  100         cmp     \$`1<<28|1<<30`,%r10d
  101         je      aesni_cbc_sha1_enc_avx
  102 ___
     # Every other case falls through to the SSSE3 implementation.  The "ret"
     # follows an unconditional jmp and is therefore never executed.
  103 $code.=<<___;
  104         jmp     aesni_cbc_sha1_enc_ssse3
  105         ret
  106 .size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
  107 ___
  108
     # Register map for the SSSE3 path.  The first six names are the register
     # arguments; $inp (%r10) receives the 7th argument, which the prologue
     # loads from the stack.  Mind the naming: $in0 is the pointer the AES code
     # reads its input blocks from, $inp the pointer SHA1 input is loaded from.
  109 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  110
     # $Xi  - index of the 4-dword message-schedule group being produced; the
     #        prologue loads the first four groups, hence the start value 4.
     # @X   - eight %xmm registers used as a rotating message-schedule window.
     # @Tx  - three rotating %xmm temporaries (one of them carries K_XX_XX).
     # @V   - the five SHA1 working variables; the same assignment also sets
     #        the globals $A..$E that the here-documents interpolate.
     # @T   - two scratch general-purpose registers used by the round bodies.
  111 my $Xi=4;
  112 my @X=map("%xmm$_",(4..7,0..3));
  113 my @Tx=map("%xmm$_",(8..10));
  114 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
  115 my @T=("%esi","%edi");
     # Generator counters:
     # $j   - SHA1 round counter, advanced when a round body is eval'd; selects
     #        the X[]+K stack slot via 4*($j&15).
     # $jj  - number of body_* invocations so far; drives AES step placement.
     # $r   - number of AES steps emitted so far by $aesenc.
     # $sn  - serial number appended to the .Laesenclast labels.
  116 my $j=0; my $jj=0; my $r=0; my $sn=0;
     # %r11 holds the address of the K_XX_XX constant table (the prologue
     # loads it with lea); the table itself is defined outside this excerpt.
  117 my $K_XX_XX="%r11";
     # AES state: $iv is the running CBC block, $in the current input block,
     # $rndkey0 the key loaded from offset 0 of the schedule, and @rndkey a
     # pair of registers that alternately hold the following round keys.
  118 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
  119 my @rndkey=("%xmm14","%xmm15");
 120
  121 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
     # Catch-all for calls to undefined subs: the sub name is taken as an
     # instruction mnemonic and one line of assembly is appended to $code.
     # Callers pass operands destination-first; the line is emitted with the
     # operands in reverse order, i.e. &op(dst,src) becomes "op src,dst".
  122 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
       # The last argument is emitted first; if it is purely numeric it is an
       # immediate and gets a "$" prefix.
  123   my $arg = pop;
  124     $arg = "\$$arg" if ($arg*1 eq $arg);
  125     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  126 }
  127
     # The round bodies call rotates through these code refs so the same body
     # text can be reused with a different rotate encoding.  Here they emit
     # plain rol/ror (via AUTOLOAD, unless such subs are defined elsewhere).
  128 my $_rol=sub { &rol(@_) };
  129 my $_ror=sub { &ror(@_) };
  130
     # Function prologue of aesni_cbc_sha1_enc_ssse3.  The 7th argument is
     # fetched before anything is pushed, then six callee-saved registers are
     # pushed and a frame of 104 bytes (plus 160 on Win64) is reserved.
     # Frame use as seen in this file:
     #    0..63(%rsp)  four 16-byte X[]+K slots read by the integer rounds
     #    88(%rsp)     saved $ivp
     #    96..(%rsp)   %xmm6-%xmm15 save area (Win64 only)
  131 $code.=<<___;
  132 .type   aesni_cbc_sha1_enc_ssse3,\@function,6
  133 .align  16
  134 aesni_cbc_sha1_enc_ssse3:
  135         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
  136         #shr    \$6,$len                        # debugging artefact
  137         #jz     .Lepilogue_ssse3                # debugging artefact
  138         push    %rbx
  139         push    %rbp
  140         push    %r12
  141         push    %r13
  142         push    %r14
  143         push    %r15
  144         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
  145         #mov    $in0,$inp                       # debugging artefact
  146         #lea    64(%rsp),$ctx                   # debugging artefact
  147 ___
     # Win64 only: preserve %xmm6-%xmm15 in the extra 160 bytes of the frame.
  148 $code.=<<___ if ($win64);
  149         movaps  %xmm6,96+0(%rsp)
  150         movaps  %xmm7,96+16(%rsp)
  151         movaps  %xmm8,96+32(%rsp)
  152         movaps  %xmm9,96+48(%rsp)
  153         movaps  %xmm10,96+64(%rsp)
  154         movaps  %xmm11,96+80(%rsp)
  155         movaps  %xmm12,96+96(%rsp)
  156         movaps  %xmm13,96+112(%rsp)
  157         movaps  %xmm14,96+128(%rsp)
  158         movaps  %xmm15,96+144(%rsp)
  159 .Lprologue_ssse3:
  160 ___
     # Copy the first four arguments into %r12-%r15, load the IV and park the
     # IV pointer on the stack so its register can be reused below.
  161 $code.=<<___;
  162         mov     $in0,%r12                       # reassign arguments
  163         mov     $out,%r13
  164         mov     $len,%r14
  165         mov     $key,%r15
  166         movdqu  ($ivp),$iv                      # load IV
  167         mov     $ivp,88(%rsp)                   # save $ivp
  168 ___
     # From here on the Perl names refer to the %r12-%r15 copies, and the low
     # 32 bits of the former $ivp register serve as the AES round count.
  169 my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
  170 my $rounds="${ivp}d";
     # Set-up before the main loop:
     #  - $len is multiplied by 64 and added to $inp, giving the end-of-input
     #    pointer, so $len arrives as a count of 64-byte blocks;
     #  - $out is rebased to ($out - $in0) so ciphertext stores can use
     #    ($out,$in0) addressing while only $in0 is advanced;
     #  - the round count is read from offset 240 of the key structure;
     #  - the SHA1 state is loaded, the first 64 input bytes are byte-swapped,
     #    K_00_19 is added and the first three groups are written to the stack;
     #  - the keys at offsets 0 and 16 of the schedule are preloaded.
  171 $code.=<<___;
  172         shl     \$6,$len
  173         sub     $in0,$out
  174         mov     240($key),$rounds
  175         add     $inp,$len               # end of input
  176
  177         lea     K_XX_XX(%rip),$K_XX_XX
  178         mov     0($ctx),$A              # load context
  179         mov     4($ctx),$B
  180         mov     8($ctx),$C
  181         mov     12($ctx),$D
  182         mov     $B,@T[0]                # magic seed
  183         mov     16($ctx),$E
  184
  185         movdqa  64($K_XX_XX),@X[2]      # pbswap mask
  186         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
  187         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
  188         movdqu  16($inp),@X[-3&7]
  189         movdqu  32($inp),@X[-2&7]
  190         movdqu  48($inp),@X[-1&7]
  191         pshufb  @X[2],@X[-4&7]          # byte swap
  192         add     \$64,$inp
  193         pshufb  @X[2],@X[-3&7]
  194         pshufb  @X[2],@X[-2&7]
  195         pshufb  @X[2],@X[-1&7]
  196         paddd   @Tx[1],@X[-4&7]         # add K_00_19
  197         paddd   @Tx[1],@X[-3&7]
  198         paddd   @Tx[1],@X[-2&7]
  199         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
  200         psubd   @Tx[1],@X[-4&7]         # restore X[]
  201         movdqa  @X[-3&7],16(%rsp)
  202         psubd   @Tx[1],@X[-3&7]
  203         movdqa  @X[-2&7],32(%rsp)
  204         psubd   @Tx[1],@X[-2&7]
  205         movups  ($key),$rndkey0         # $key[0]
  206         movups  16($key),$rndkey[0]     # forward reference
  207         jmp     .Loop_ssse3
  208 ___
  209
     # Emit the next step of the AES-CBC encryption that is interleaved with
     # the SHA1 rounds.  Successive calls are counted in $r: $n = $r/10 is the
     # 16-byte block index within the current 64-byte chunk and $k = $r%10 the
     # step within that block.  Every step also preloads the round key needed
     # by the following step into the other @rndkey register, and @rndkey is
     # swapped at the end of each call.
  210 my $aesenc=sub {
  211   use integer;
  212   my ($n,$k)=($r/10,$r%10);
         # Step 0: fetch input block $n, xor it with the key from offset 0 and
         # with the running CBC value (after storing the previous ciphertext
         # block when there is one), then perform the first aesenc.
  213     if ($k==0) {
  214       $code.=<<___;
  215         movups          `16*$n`($in0),$in               # load input
  216         xorps           $rndkey0,$in
  217 ___
  218       $code.=<<___ if ($n);
  219         movups          $iv,`16*($n-1)`($out,$in0)      # write output
  220 ___
  221       $code.=<<___;
  222         xorps           $in,$iv
  223         aesenc          $rndkey[0],$iv
  224         movups          `32+16*$k`($key),$rndkey[1]
  225 ___
         # Step 9: finish the block for any key length.  A round count below
         # 11 goes straight to aesenclast, exactly 11 performs two additional
         # aesenc rounds first, anything larger performs four.  Afterwards the
         # key at offset 16 is reloaded for the next block.
  226     } elsif ($k==9) {
  227       $sn++;
  228       $code.=<<___;
  229         cmp             \$11,$rounds
  230         jb              .Laesenclast$sn
  231         movups          `32+16*($k+0)`($key),$rndkey[1]
  232         aesenc          $rndkey[0],$iv
  233         movups          `32+16*($k+1)`($key),$rndkey[0]
  234         aesenc          $rndkey[1],$iv
  235         je              .Laesenclast$sn
  236         movups          `32+16*($k+2)`($key),$rndkey[1]
  237         aesenc          $rndkey[0],$iv
  238         movups          `32+16*($k+3)`($key),$rndkey[0]
  239         aesenc          $rndkey[1],$iv
  240 .Laesenclast$sn:
  241         aesenclast      $rndkey[0],$iv
  242         movups          16($key),$rndkey[1]             # forward reference
  243 ___
         # Steps 1..8: one aesenc plus the preload of the next round key.
  244     } else {
  245       $code.=<<___;
  246         aesenc          $rndkey[0],$iv
  247         movups          `32+16*$k`($key),$rndkey[1]
  248 ___
  249     }
  250     $r++;       unshift(@rndkey,pop(@rndkey));
  251 };
 252
  253 sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
  254 { use integer;
       # Generate four SHA1 rounds interleaved with the SSSE3 computation of
       # the next four message-schedule dwords (the rotate-by-1 form of the
       # recurrence, as the operand comments below spell out).
       # Argument: reference to a round-body generator (body_*).  It is called
       # four times; each call returns a list of code strings, which are
       # eval'd one or two at a time between the vector instructions.
       # The previous group plus K is also written to its stack slot, and the
       # K_XX_XX constant for group $Xi/5 is loaded into @Tx[2].
       # On exit $Xi is advanced and @X / @Tx are rotated by one position.
  255   my $body = shift;
  256   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  257   my ($a,$b,$c,$d,$e);
  258
  259         &movdqa (@X[0],@X[-3&7]);
  260          eval(shift(@insns));
  261          eval(shift(@insns));
  262         &movdqa (@Tx[0],@X[-1&7]);
  263         &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
  264          eval(shift(@insns));
  265          eval(shift(@insns));
  266
  267           &paddd        (@Tx[1],@X[-1&7]);
  268          eval(shift(@insns));
  269          eval(shift(@insns));
  270         &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
  271          eval(shift(@insns));
  272          eval(shift(@insns));
  273         &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
  274          eval(shift(@insns));
  275          eval(shift(@insns));
  276
  277         &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
  278          eval(shift(@insns));
  279          eval(shift(@insns));
  280          eval(shift(@insns));
  281          eval(shift(@insns));
  282
  283         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
  284          eval(shift(@insns));
  285          eval(shift(@insns));
  286           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  287          eval(shift(@insns));
  288          eval(shift(@insns));
  289
  290         &movdqa (@Tx[2],@X[0]);
  291         &movdqa (@Tx[0],@X[0]);
  292          eval(shift(@insns));
  293          eval(shift(@insns));
  294          eval(shift(@insns));
  295          eval(shift(@insns));
  296
  297         &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
  298         &paddd  (@X[0],@X[0]);
  299          eval(shift(@insns));
  300          eval(shift(@insns));
  301          eval(shift(@insns));
  302          eval(shift(@insns));
  303
  304         &psrld  (@Tx[0],31);
  305          eval(shift(@insns));
  306          eval(shift(@insns));
  307         &movdqa (@Tx[1],@Tx[2]);
  308          eval(shift(@insns));
  309          eval(shift(@insns));
  310
  311         &psrld  (@Tx[2],30);
  312         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
  313          eval(shift(@insns));
  314          eval(shift(@insns));
  315          eval(shift(@insns));
  316          eval(shift(@insns));
  317
  318         &pslld  (@Tx[1],2);
  319         &pxor   (@X[0],@Tx[2]);
  320          eval(shift(@insns));
  321          eval(shift(@insns));
  322           &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
  323          eval(shift(@insns));
  324          eval(shift(@insns));
  325
  326         &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
  327
  328          foreach (@insns) { eval; }     # remaining instructions [if any]
  329
  330   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
  331                 push(@Tx,shift(@Tx));
  332 }
 333
  334 sub Xupdate_ssse3_32_79()
  335 { use integer;
       # Generate four SHA1 rounds interleaved with the SSSE3 computation of
       # the next four message-schedule dwords, using the form of the
       # recurrence that xors X[-32], X[-28], X[-16] and X[-6] and rotates the
       # result left by 2 (see the operand comments below).
       # Argument: reference to a round-body generator (body_*), called four
       # times; the returned code strings are eval'd between the vector
       # instructions.  The previous group plus K is written to its stack
       # slot; K_XX_XX is either carried over or, when $Xi is a multiple of 5,
       # reloaded from the constant table.
       # On exit $Xi is advanced and @X / @Tx are rotated by one position.
  336   my $body = shift;
  337   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  338   my ($a,$b,$c,$d,$e);
  339
  340         &movdqa (@Tx[0],@X[-1&7])       if ($Xi==8);
  341          eval(shift(@insns));           # body_20_39
  342         &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
  343         &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
  344          eval(shift(@insns));
  345          eval(shift(@insns));
  346          eval(shift(@insns));           # rol
  347
  348         &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
  349          eval(shift(@insns));
         # NOTE(review): the round bodies in this file spell their rotates as
         # &$_rol/&$_ror, which the pattern below does not match, so this
         # guard appears to pass every time - confirm that is intended.
  350          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
  351         if ($Xi%5) {
  352           &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
  353         } else {                        # ... or load next one
  354           &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
  355         }
  356           &paddd        (@Tx[1],@X[-1&7]);
  357          eval(shift(@insns));           # ror
  358          eval(shift(@insns));
  359
  360         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
  361          eval(shift(@insns));           # body_20_39
  362          eval(shift(@insns));
  363          eval(shift(@insns));
  364          eval(shift(@insns));           # rol
  365
  366         &movdqa (@Tx[0],@X[0]);
  367           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  368          eval(shift(@insns));
  369          eval(shift(@insns));
  370          eval(shift(@insns));           # ror
  371          eval(shift(@insns));
  372
  373         &pslld  (@X[0],2);
  374          eval(shift(@insns));           # body_20_39
  375          eval(shift(@insns));
  376         &psrld  (@Tx[0],30);
  377          eval(shift(@insns));
  378          eval(shift(@insns));           # rol
  379          eval(shift(@insns));
  380          eval(shift(@insns));
  381          eval(shift(@insns));           # ror
  382          eval(shift(@insns));
  383
  384         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
  385          eval(shift(@insns));           # body_20_39
  386          eval(shift(@insns));
  387           &movdqa       (@Tx[1],@X[0])  if ($Xi<19);
  388          eval(shift(@insns));
  389          eval(shift(@insns));           # rol
  390          eval(shift(@insns));
  391          eval(shift(@insns));
  392          eval(shift(@insns));           # rol
  393          eval(shift(@insns));
  394
  395          foreach (@insns) { eval; }     # remaining instructions
  396
  397   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
  398                 push(@Tx,shift(@Tx));
  399 }
 400
  401 sub Xuplast_ssse3_80()
  402 { use integer;
       # Generate four SHA1 rounds with no further message-schedule work: the
       # last computed group plus K is written to its stack slot, then the
       # code compares $inp with the end-of-input pointer in $len and branches
       # to .Ldone_ssse3 when they are equal.
       # Everything emitted after that branch belongs to the "more input"
       # path: it reloads the byte-swap mask and K_00_19, loads the next 64
       # bytes of SHA1 input and starts byte-swapping them.
       # Argument: reference to a round-body generator (body_*).
       # On exit $Xi is reset to 0 for the Xloop_ssse3 calls that follow.
  403   my $body = shift;
  404   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  405   my ($a,$b,$c,$d,$e);
  406
  407          eval(shift(@insns));
  408           &paddd        (@Tx[1],@X[-1&7]);
  409          eval(shift(@insns));
  410          eval(shift(@insns));
  411          eval(shift(@insns));
  412          eval(shift(@insns));
  413
  414           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
  415
  416          foreach (@insns) { eval; }             # remaining instructions
  417
  418         &cmp    ($inp,$len);
  419         &je     (".Ldone_ssse3");
  420
         # Rotate @Tx one step in the opposite direction to the rotation
         # performed at the end of the Xupdate_* routines.
  421         unshift(@Tx,pop(@Tx));
  422
  423         &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
  424         &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
  425         &movdqu (@X[-4&7],"0($inp)");           # load input
  426         &movdqu (@X[-3&7],"16($inp)");
  427         &movdqu (@X[-2&7],"32($inp)");
  428         &movdqu (@X[-1&7],"48($inp)");
  429         &pshufb (@X[-4&7],@X[2]);               # byte swap
  430         &add    ($inp,64);
  431
  432   $Xi=0;
  433 }
 434
  435 sub Xloop_ssse3()
  436 { use integer;
       # Generate four SHA1 rounds of the current block while preparing one
       # 16-byte group of the NEXT block: the following group is byte-swapped,
       # K (in @Tx[1]) is added to group $Xi, the sum is written to stack slot
       # 16*$Xi, and K is subtracted again to restore the plain value.
       # Argument: reference to a round-body generator (body_*).
       # On exit $Xi is advanced; @X and @Tx are not rotated here.
  437   my $body = shift;
  438   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  439   my ($a,$b,$c,$d,$e);
  440
  441          eval(shift(@insns));
  442          eval(shift(@insns));
  443         &pshufb (@X[($Xi-3)&7],@X[2]);
  444          eval(shift(@insns));
  445          eval(shift(@insns));
  446         &paddd  (@X[($Xi-4)&7],@Tx[1]);
  447          eval(shift(@insns));
  448          eval(shift(@insns));
  449          eval(shift(@insns));
  450          eval(shift(@insns));
  451         &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
  452          eval(shift(@insns));
  453          eval(shift(@insns));
  454         &psubd  (@X[($Xi-4)&7],@Tx[1]);
  455
  456         foreach (@insns) { eval; }
  457   $Xi++;
  458 }
 459
  460 sub Xtail_ssse3()
  461 { use integer;
       # Generate four SHA1 rounds with no vector work at all; used on the
       # .Ldone_ssse3 path where there is no further input block to prepare.
       # Argument: reference to a round-body generator (body_*).
  462   my $body = shift;
  463   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  464   my ($a,$b,$c,$d,$e);
  465
  466         foreach (@insns) { eval; }
  467 }
 468
  469 sub body_00_19 () {
       # Return the code for one SHA1 round of the first kind as a list of ten
       # strings, each emitting one scalar instruction when eval'd by the
       # caller.  The first string also binds $a..$e to the current @V; the
       # last one advances $j and rotates @V and @T for the next round.
       # $a is rotated left by 5 in place and becomes $b in the next round, so
       # rotating $b right by 7 there nets the required rotate right by 2; the
       # very first round ($j==0) has no pending rotate and uses 2 directly.
       # AES interleaving: an integer formula over $jj spreads 12 AES steps
       # across 20 rounds; when it selects this invocation, a call to $aesenc
       # is appended to one of the instruction strings.
  470   use integer;
  471   my ($k,$n);
  472   my @r=(
  473         '($a,$b,$c,$d,$e)=@V;'.
  474         '&add   ($e,eval(4*($j&15))."(%rsp)");',        # X[]+K xfer
  475         '&xor   ($c,$d);',
  476         '&mov   (@T[1],$a);',   # $b in next round
  477         '&$_rol ($a,5);',
  478         '&and   (@T[0],$c);',   # ($b&($c^$d))
  479         '&xor   ($c,$d);',      # restore $c
  480         '&xor   (@T[0],$d);',
  481         '&add   ($e,$a);',
  482         '&$_ror ($b,$j?7:2);',  # $b>>>2
  483         '&add   ($e,@T[0]);'    .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  484         );
  485         $n = scalar(@r);
  486         $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
  487         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
  488         $jj++;
  489     return @r;
  490 }
 491
  492 sub body_20_39 () {
       # Return the code for one SHA1 round of the xor kind (b^c^d) as a list
       # of eight strings, each emitting one scalar instruction when eval'd by
       # the caller.  The first string binds $a..$e to the current @V and
       # advances $j; the last one rotates @V and @T for the next round.
       # AES interleaving: an integer formula over $jj spreads 8 AES steps
       # across 20 rounds; when it selects this invocation, a call to $aesenc
       # is appended to one of the instruction strings.
  493   use integer;
  494   my ($k,$n);
  495   my @r=(
  496         '($a,$b,$c,$d,$e)=@V;'.
  497         '&add   ($e,eval(4*($j++&15))."(%rsp)");',      # X[]+K xfer
  498         '&xor   (@T[0],$d);',   # ($b^$d)
  499         '&mov   (@T[1],$a);',   # $b in next round
  500         '&$_rol ($a,5);',
  501         '&xor   (@T[0],$c);',   # ($b^$d^$c)
  502         '&add   ($e,$a);',
  503         '&$_ror ($b,7);',       # $b>>>2
  504         '&add   ($e,@T[0]);'    .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  505         );
  506         $n = scalar(@r);
  507         $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
  508         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
  509         $jj++;
  510     return @r;
  511 }
 512
  513 sub body_40_59 () {
       # Return the code for one SHA1 round of the third kind as a list of
       # twelve strings, each emitting one scalar instruction when eval'd by
       # the caller.  Two terms, ($c&$d) in @T[1] and ($b&($c^$d)) in @T[0],
       # are added to $e separately.  The first string binds $a..$e to the
       # current @V; the last one rotates @V and @T for the next round.
       # AES interleaving: an integer formula over $jj spreads 12 AES steps
       # across 20 rounds; when it selects this invocation, a call to $aesenc
       # is appended to one of the instruction strings.
  514   use integer;
  515   my ($k,$n);
  516   my @r=(
  517         '($a,$b,$c,$d,$e)=@V;'.
  518         '&mov   (@T[1],$c);',
  519         '&xor   ($c,$d);',
  520         '&add   ($e,eval(4*($j++&15))."(%rsp)");',      # X[]+K xfer
  521         '&and   (@T[1],$d);',
  522         '&and   (@T[0],$c);',   # ($b&($c^$d))
  523         '&$_ror ($b,7);',       # $b>>>2
  524         '&add   ($e,@T[1]);',
  525         '&mov   (@T[1],$a);',   # $b in next round
  526         '&$_rol ($a,5);',
  527         '&add   ($e,@T[0]);',
  528         '&xor   ($c,$d);',      # restore $c
  529         '&add   ($e,$a);'       .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  530         );
  531         $n = scalar(@r);
  532         $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
  533         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
  534         $jj++;
  535     return @r;
  536 }
  537 $code.=<<___;
  538 .align  16
  539 .Loop_ssse3:
  540 ___
     # Body of the main loop: 80 SHA1 rounds for one 64-byte block, generated
     # four rounds per call, with the AES steps woven in by the round bodies.
     #   rounds  0-19: body_00_19 (four Xupdate_16_31 calls, one Xupdate_32_79)
     #   rounds 20-39: body_20_39
     #   rounds 40-59: body_40_59
     #   rounds 60-79: body_20_39 again (Xupdate_32_79, Xuplast, three Xloop)
  541         &Xupdate_ssse3_16_31(\&body_00_19);
  542         &Xupdate_ssse3_16_31(\&body_00_19);
  543         &Xupdate_ssse3_16_31(\&body_00_19);
  544         &Xupdate_ssse3_16_31(\&body_00_19);
  545         &Xupdate_ssse3_32_79(\&body_00_19);
  546         &Xupdate_ssse3_32_79(\&body_20_39);
  547         &Xupdate_ssse3_32_79(\&body_20_39);
  548         &Xupdate_ssse3_32_79(\&body_20_39);
  549         &Xupdate_ssse3_32_79(\&body_20_39);
  550         &Xupdate_ssse3_32_79(\&body_20_39);
  551         &Xupdate_ssse3_32_79(\&body_40_59);
  552         &Xupdate_ssse3_32_79(\&body_40_59);
  553         &Xupdate_ssse3_32_79(\&body_40_59);
  554         &Xupdate_ssse3_32_79(\&body_40_59);
  555         &Xupdate_ssse3_32_79(\&body_40_59);
  556         &Xupdate_ssse3_32_79(\&body_20_39);
  557         &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"
  558
     # Xuplast emitted the branch to .Ldone_ssse3.  Snapshot the generator
     # state here so the last twelve rounds can be generated a second time,
     # from the same starting point, for that path.
  559                                 $saved_j=$j; @saved_V=@V;
  560                                 $saved_r=$r; @saved_rndkey=@rndkey;
  561
     # Last twelve rounds on the "more input" path, overlapped with the
     # preparation of the next block's first three groups.
  562         &Xloop_ssse3(\&body_20_39);
  563         &Xloop_ssse3(\&body_20_39);
  564         &Xloop_ssse3(\&body_20_39);
  565
     # End of one loop iteration: store the fourth ciphertext block, advance
     # $in0 by 64 bytes, add the working variables into the SHA1 context (the
     # second state word is held in @T[0] at this point and is copied back to
     # $B) and branch back to .Loop_ssse3.  The .Ldone_ssse3 label that
     # follows starts the path taken when the input is exhausted.
  566 $code.=<<___;
  567         movups  $iv,48($out,$in0)               # write output
  568         lea     64($in0),$in0
  569
  570         add     0($ctx),$A                      # update context
  571         add     4($ctx),@T[0]
  572         add     8($ctx),$C
  573         add     12($ctx),$D
  574         mov     $A,0($ctx)
  575         add     16($ctx),$E
  576         mov     @T[0],4($ctx)
  577         mov     @T[0],$B                        # magic seed
  578         mov     $C,8($ctx)
  579         mov     $D,12($ctx)
  580         mov     $E,16($ctx)
  581         jmp     .Loop_ssse3
  582
  583 .align  16
  584 .Ldone_ssse3:
  585 ___
     # Rewind the generator to the snapshot taken after Xuplast_ssse3_80 ($jj
     # is set to the same value as $j) and generate the last twelve rounds
     # again, this time without preparing a next block.  $sn is not rewound,
     # so the .Laesenclast label numbers simply continue.
  586                                 $jj=$j=$saved_j; @V=@saved_V;
  587                                 $r=$saved_r;     @rndkey=@saved_rndkey;
  588
  589         &Xtail_ssse3(\&body_20_39);
  590         &Xtail_ssse3(\&body_20_39);
  591         &Xtail_ssse3(\&body_20_39);
  592
     # Final block: store the last ciphertext block, reload the IV pointer
     # saved at 88(%rsp), add the working variables into the SHA1 context and
     # write the last ciphertext block back through the IV pointer.
  593 $code.=<<___;
  594         movups  $iv,48($out,$in0)               # write output
  595         mov     88(%rsp),$ivp                   # restore $ivp
  596
  597         add     0($ctx),$A                      # update context
  598         add     4($ctx),@T[0]
  599         add     8($ctx),$C
  600         mov     $A,0($ctx)
  601         add     12($ctx),$D
  602         mov     @T[0],4($ctx)
  603         add     16($ctx),$E
  604         mov     $C,8($ctx)
  605         mov     $D,12($ctx)
  606         mov     $E,16($ctx)
  607         movups  $iv,($ivp)                      # write IV
  608 ___
     # Win64 only: restore %xmm6-%xmm15 from the save area filled in the
     # prologue.
  609 $code.=<<___ if ($win64);
  610         movaps  96+0(%rsp),%xmm6
  611         movaps  96+16(%rsp),%xmm7
  612         movaps  96+32(%rsp),%xmm8
  613         movaps  96+48(%rsp),%xmm9
  614         movaps  96+64(%rsp),%xmm10
  615         movaps  96+80(%rsp),%xmm11
  616         movaps  96+96(%rsp),%xmm12
  617         movaps  96+112(%rsp),%xmm13
  618         movaps  96+128(%rsp),%xmm14
  619         movaps  96+144(%rsp),%xmm15
  620 ___
     # Point %rsi at the six registers pushed in the prologue, reload them in
     # reverse push order, release the frame and return.
  621 $code.=<<___;
  622         lea     `104+($win64?10*16:0)`(%rsp),%rsi
  623         mov     0(%rsi),%r15
  624         mov     8(%rsi),%r14
  625         mov     16(%rsi),%r13
  626         mov     24(%rsi),%r12
  627         mov     32(%rsi),%rbp
  628         mov     40(%rsi),%rbx
  629         lea     48(%rsi),%rsp
  630 .Lepilogue_ssse3:
  631         ret
  632 .size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
  633 ___
  634
     # Reset the generator counters before the AVX variant is generated.
  635 $j=$jj=$r=$sn=0;
 636
 637 if ($avx) {
  638 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  639
     # Fresh copies of the register map for the AVX path, with the same
     # assignments as the SSSE3 path: message-schedule window @X, vector
     # temporaries @Tx, SHA1 working variables @V and scratch registers @T.
  640 my $Xi=4;
  641 my @X=map("%xmm$_",(4..7,0..3));
  642 my @Tx=map("%xmm$_",(8..10));
  643 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
  644 my @T=("%esi","%edi");
  645
     # In this path the round bodies' rotates are emitted as shld/shrd with
     # the rotated register passed twice (as the first two operands).
  646 my $_rol=sub { &shld(@_[0],@_) };
  647 my $_ror=sub { &shrd(@_[0],@_) };
  648
     # Function prologue of aesni_cbc_sha1_enc_avx.  The 7th argument is
     # fetched before anything is pushed, then six callee-saved registers are
     # pushed and a frame of 104 bytes (plus 160 on Win64) is reserved.
     # Frame use as seen in this file:
     #    0..63(%rsp)  four 16-byte X[]+K slots read by the integer rounds
     #    88(%rsp)     saved $ivp
     #    96..(%rsp)   %xmm6-%xmm15 save area (Win64 only)
  649 $code.=<<___;
  650 .type   aesni_cbc_sha1_enc_avx,\@function,6
  651 .align  16
  652 aesni_cbc_sha1_enc_avx:
  653         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
  654         #shr    \$6,$len                        # debugging artefact
  655         #jz     .Lepilogue_avx                  # debugging artefact
  656         push    %rbx
  657         push    %rbp
  658         push    %r12
  659         push    %r13
  660         push    %r14
  661         push    %r15
  662         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
  663         #mov    $in0,$inp                       # debugging artefact
  664         #lea    64(%rsp),$ctx                   # debugging artefact
  665 ___
     # Win64 only: preserve %xmm6-%xmm15 in the extra 160 bytes of the frame.
  666 $code.=<<___ if ($win64);
  667         movaps  %xmm6,96+0(%rsp)
  668         movaps  %xmm7,96+16(%rsp)
  669         movaps  %xmm8,96+32(%rsp)
  670         movaps  %xmm9,96+48(%rsp)
  671         movaps  %xmm10,96+64(%rsp)
  672         movaps  %xmm11,96+80(%rsp)
  673         movaps  %xmm12,96+96(%rsp)
  674         movaps  %xmm13,96+112(%rsp)
  675         movaps  %xmm14,96+128(%rsp)
  676         movaps  %xmm15,96+144(%rsp)
  677 .Lprologue_avx:
  678 ___
     # Clear the vector registers, copy the first four arguments into
     # %r12-%r15, load the IV and park the IV pointer on the stack so its
     # register can be reused below.
  679 $code.=<<___;
  680         vzeroall
  681         mov     $in0,%r12                       # reassign arguments
  682         mov     $out,%r13
  683         mov     $len,%r14
  684         mov     $key,%r15
  685         vmovdqu ($ivp),$iv                      # load IV
  686         mov     $ivp,88(%rsp)                   # save $ivp
  687 ___
     # From here on the Perl names refer to the %r12-%r15 copies, and the low
     # 32 bits of the former $ivp register serve as the AES round count.
  688 my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
  689 my $rounds="${ivp}d";
     # Set-up before the main loop, as in the SSSE3 path with two differences:
     # $key is advanced by 112, so every later key-schedule offset in this
     # path is written relative to that bias (the "-112" terms), and the
     # three-operand vpaddd leaves the byte-swapped input intact, so no
     # subtraction is needed to restore it after the X[]+K stores.
  690 $code.=<<___;
  691         shl     \$6,$len
  692         sub     $in0,$out
  693         mov     240($key),$rounds
  694         add     \$112,$key              # size optimization
  695         add     $inp,$len               # end of input
  696
  697         lea     K_XX_XX(%rip),$K_XX_XX
  698         mov     0($ctx),$A              # load context
  699         mov     4($ctx),$B
  700         mov     8($ctx),$C
  701         mov     12($ctx),$D
  702         mov     $B,@T[0]                # magic seed
  703         mov     16($ctx),$E
  704
  705         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
  706         vmovdqa 0($K_XX_XX),@Tx[1]      # K_00_19
  707         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
  708         vmovdqu 16($inp),@X[-3&7]
  709         vmovdqu 32($inp),@X[-2&7]
  710         vmovdqu 48($inp),@X[-1&7]
  711         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
  712         add     \$64,$inp
  713         vpshufb @X[2],@X[-3&7],@X[-3&7]
  714         vpshufb @X[2],@X[-2&7],@X[-2&7]
  715         vpshufb @X[2],@X[-1&7],@X[-1&7]
  716         vpaddd  @Tx[1],@X[-4&7],@X[0]   # add K_00_19
  717         vpaddd  @Tx[1],@X[-3&7],@X[1]
  718         vpaddd  @Tx[1],@X[-2&7],@X[2]
  719         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
  720         vmovdqa @X[1],16(%rsp)
  721         vmovdqa @X[2],32(%rsp)
  722         vmovups -112($key),$rndkey0     # $key[0]
  723         vmovups 16-112($key),$rndkey[0] # forward reference
  724         jmp     .Loop_avx
  725 ___
  726
     # AVX flavour of the AES step emitter.  Successive calls are counted in
     # $r: $n = $r/10 is the 16-byte block index within the current 64-byte
     # chunk and $k = $r%10 the step within that block.  Key-schedule offsets
     # carry a "-112" term because $key was advanced by 112 in the prologue.
     # Every step also preloads the round key needed by the following step,
     # and @rndkey is swapped at the end of each call.
  727 my $aesenc=sub {
  728   use integer;
  729   my ($n,$k)=($r/10,$r%10);
         # Step 0: fetch input block $n, xor it with the key from schedule
         # offset 0 and with the running CBC value (after storing the previous
         # ciphertext block when there is one), then perform the first round.
  730     if ($k==0) {
  731       $code.=<<___;
  732         vmovups         `16*$n`($in0),$in               # load input
  733         vxorps          $rndkey0,$in,$in
  734 ___
  735       $code.=<<___ if ($n);
  736         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
  737 ___
  738       $code.=<<___;
  739         vxorps          $in,$iv,$iv
  740         vaesenc         $rndkey[0],$iv,$iv
  741         vmovups         `32+16*$k-112`($key),$rndkey[1]
  742 ___
         # Step 9: finish the block for any key length.  A round count below
         # 11 goes straight to vaesenclast, exactly 11 performs two additional
         # rounds first, anything larger performs four.  Afterwards the key at
         # schedule offset 16 is reloaded for the next block.
  743     } elsif ($k==9) {
  744       $sn++;
  745       $code.=<<___;
  746         cmp             \$11,$rounds
  747         jb              .Lvaesenclast$sn
  748         vaesenc         $rndkey[0],$iv,$iv
  749         vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
  750         vaesenc         $rndkey[1],$iv,$iv
  751         vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
  752         je              .Lvaesenclast$sn
  753         vaesenc         $rndkey[0],$iv,$iv
  754         vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
  755         vaesenc         $rndkey[1],$iv,$iv
  756         vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
  757 .Lvaesenclast$sn:
  758         vaesenclast     $rndkey[0],$iv,$iv
  759         vmovups         16-112($key),$rndkey[1]         # forward reference
  760 ___
         # Steps 1..8: one round plus the preload of the next round key.
  761     } else {
  762       $code.=<<___;
  763         vaesenc         $rndkey[0],$iv,$iv
  764         vmovups         `32+16*$k-112`($key),$rndkey[1]
  765 ___
  766     }
  767     $r++;       unshift(@rndkey,pop(@rndkey));
  768 };
 769
  770 sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
  771 { use integer;
       # AVX counterpart of Xupdate_ssse3_16_31: generate four SHA1 rounds
       # interleaved with the computation of the next four message-schedule
       # dwords (the rotate-by-1 form of the recurrence, as the operand
       # comments below spell out).  The three-operand AVX forms make the
       # register copies of the SSSE3 version unnecessary.
       # Argument: reference to a round-body generator (body_*), called four
       # times; the returned code strings are eval'd between the vector
       # instructions.  The previous group plus K is also written to its
       # stack slot, and K_XX_XX for group $Xi/5 is loaded into @Tx[2].
       # On exit $Xi is advanced and @X / @Tx are rotated by one position.
  772   my $body = shift;
  773   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
       # Lexicals the eval'd body strings assign the working variables to.
  774   my ($a,$b,$c,$d,$e);
  775
  776          eval(shift(@insns));
  777          eval(shift(@insns));
  778         &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
  779          eval(shift(@insns));
  780          eval(shift(@insns));
  781
  782           &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
  783          eval(shift(@insns));
  784          eval(shift(@insns));
  785         &vpsrldq(@Tx[0],@X[-1&7],4);    # "X[-3]", 3 dwords
  786          eval(shift(@insns));
  787          eval(shift(@insns));
  788         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
  789          eval(shift(@insns));
  790          eval(shift(@insns));
  791
  792         &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
  793          eval(shift(@insns));
  794          eval(shift(@insns));
  795          eval(shift(@insns));
  796          eval(shift(@insns));
  797
  798         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
  799          eval(shift(@insns));
  800          eval(shift(@insns));
  801           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  802          eval(shift(@insns));
  803          eval(shift(@insns));
  804
  805         &vpsrld (@Tx[0],@X[0],31);
  806          eval(shift(@insns));
  807          eval(shift(@insns));
  808          eval(shift(@insns));
  809          eval(shift(@insns));
  810
  811         &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
  812         &vpaddd (@X[0],@X[0],@X[0]);
  813          eval(shift(@insns));
  814          eval(shift(@insns));
  815          eval(shift(@insns));
  816          eval(shift(@insns));
  817
  818         &vpsrld (@Tx[1],@Tx[2],30);
  819         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
  820          eval(shift(@insns));
  821          eval(shift(@insns));
  822          eval(shift(@insns));
  823          eval(shift(@insns));
  824
  825         &vpslld (@Tx[2],@Tx[2],2);
  826         &vpxor  (@X[0],@X[0],@Tx[1]);
  827          eval(shift(@insns));
  828          eval(shift(@insns));
  829          eval(shift(@insns));
  830          eval(shift(@insns));
  831
  832         &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
  833          eval(shift(@insns));
  834          eval(shift(@insns));
  835           &vmovdqa      (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
  836          eval(shift(@insns));
  837          eval(shift(@insns));
  838
  839
  840          foreach (@insns) { eval; }     # remaining instructions [if any]
  841
  842   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
  843                 push(@Tx,shift(@Tx));
  844 }
 845
 846 sub Xupdate_avx_32_79()
     # Emit one vector message-schedule update of the AVX path (schedule
     # words 32 and up, going by the name), interleaved with scalar rounds.
     #
     # Per the inline notes the new X[0] is
     #   ("X[-32]" ^ "X[-16]" ^ "X[-28]" ^ "X[-6]") <<< 2
     # and X[-1] plus the round constant held in Tx[1] is written to stack
     # slot 16*(($Xi-1)&3)(%rsp) for the scalar code to pick up.
     #
     # $body is a code ref.  It is invoked four times and each call returns a
     # list of Perl snippets (strings); presumably one round per call.  The
     # snippets are eval'ed one at a time between the vector instructions, so
     # the position of every eval(shift(@insns)) below decides where the
     # corresponding scalar instruction lands in the generated assembly.
     #
     # Side effects: increments the file-level $Xi and rotates @X and @Tx.
 847 { use integer;
 848   my $body = shift;
 849   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
       # Not referenced directly in this sub; presumably assigned and read by
       # the eval'ed snippets, which run in this lexical scope -- confirm
       # against the body_* subs before renaming or removing.
 850   my ($a,$b,$c,$d,$e);
 851
 852         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
 853         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
 854          eval(shift(@insns));           # body_20_39
 855          eval(shift(@insns));
 856          eval(shift(@insns));
 857          eval(shift(@insns));           # rol
 858
 859         &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
 860          eval(shift(@insns));
              # take one more snippet here only if it is not a rotate
              # (&ror/&rol); otherwise it stays queued for the next slot
 861          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
             # every fifth step ($Xi%5==0) switches to the next 16-byte
             # entry of the K_XX_XX table, otherwise the current one is kept
 862         if ($Xi%5) {
 863           &vmovdqa      (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
 864         } else {                        # ... or load next one
 865           &vmovdqa      (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
 866         }
               # Tx[1] = K + X[-1]; stored to the stack further down
 867           &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
 868          eval(shift(@insns));           # ror
 869          eval(shift(@insns));
 870
 871         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
 872          eval(shift(@insns));           # body_20_39
 873          eval(shift(@insns));
 874          eval(shift(@insns));
 875          eval(shift(@insns));           # rol
 876
             # rotate left by 2: Tx[0] = X[0]>>30, X[0] <<= 2, then OR below
 877         &vpsrld (@Tx[0],@X[0],30);
 878           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 879          eval(shift(@insns));
 880          eval(shift(@insns));
 881          eval(shift(@insns));           # ror
 882          eval(shift(@insns));
 883
 884         &vpslld (@X[0],@X[0],2);
 885          eval(shift(@insns));           # body_20_39
 886          eval(shift(@insns));
 887          eval(shift(@insns));
 888          eval(shift(@insns));           # rol
 889          eval(shift(@insns));
 890          eval(shift(@insns));
 891          eval(shift(@insns));           # ror
 892          eval(shift(@insns));
 893
 894         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
 895          eval(shift(@insns));           # body_20_39
 896          eval(shift(@insns));
               # copy of the finished X[0] into Tx[1], emitted only while $Xi<19
 897           &vmovdqa      (@Tx[1],@X[0])  if ($Xi<19);
 898          eval(shift(@insns));
 899          eval(shift(@insns));           # rol
 900          eval(shift(@insns));
 901          eval(shift(@insns));
 902          eval(shift(@insns));           # rol
 903          eval(shift(@insns));
 904
 905          foreach (@insns) { eval; }     # remaining instructions
 906
       # advance the step counter; what was X[1]/Tx[1] becomes X[0]/Tx[0]
 907   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 908                 push(@Tx,shift(@Tx));
 909 }
 910
 911 sub Xuplast_avx_80()
     # Emit the last X[]+K transfer of the current block for the AVX path,
     # drain the scalar snippets supplied by $body, then emit the
     # end-of-input test and the reload of constants and of the next 64
     # bytes of input.
     #
     # $body is a code ref.  It is invoked four times and each call returns a
     # list of Perl snippets (strings) that are eval'ed in this scope.
     #
     # The generated code branches to .Ldone_avx when $inp equals $len.
     #
     # Side effects: rotates @Tx one step back (the opposite direction to the
     # rotation done by the Xupdate_* helpers) and resets the file-level $Xi
     # to 0.
 912 { use integer;
 913   my $body = shift;
 914   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably used by the eval'ed
       # snippets, which run in this lexical scope -- do not rename.
 915   my ($a,$b,$c,$d,$e);
 916
 917          eval(shift(@insns));
 918           &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
 919          eval(shift(@insns));
 920          eval(shift(@insns));
 921          eval(shift(@insns));
 922          eval(shift(@insns));
 923
               # VEX-encoded store, matching the identical transfer in
               # Xupdate_avx_32_79.  This used to be a legacy-SSE movdqa,
               # the only non-VEX vector move among the AVX helpers.
 924           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 925
 926          foreach (@insns) { eval; }             # remaining instructions
 927
             # all input consumed? then finish via the .Ldone_avx tail
 928         &cmp    ($inp,$len);
 929         &je     (".Ldone_avx");
 930
 931         unshift(@Tx,pop(@Tx));
 932
             # set up the next block: X[2] doubles as the byte-swap mask,
             # Tx[1] restarts at the first round constant
 933         &vmovdqa(@X[2],"64($K_XX_XX)");         # pbswap mask
 934         &vmovdqa(@Tx[1],"0($K_XX_XX)");         # K_00_19
 935         &vmovdqu(@X[-4&7],"0($inp)");           # load input
 936         &vmovdqu(@X[-3&7],"16($inp)");
 937         &vmovdqu(@X[-2&7],"32($inp)");
 938         &vmovdqu(@X[-1&7],"48($inp)");
             # only the first vector is swapped here; the Xloop_avx calls
             # that follow swap the other three
 939         &vpshufb(@X[-4&7],@X[-4&7],@X[2]);      # byte swap
 940         &add    ($inp,64);
 941
 942   $Xi=0;
 943 }
 944
 945 sub Xloop_avx()
     # Emit the preparation of one input vector of the NEXT block, interleaved
     # with scalar snippets of the current block.
     #
     # For the current $Xi it byte-swaps X[($Xi-3)&7] with the mask in X[2],
     # adds the constant in Tx[1] to X[($Xi-4)&7], and stores the sum to
     # 16*$Xi(%rsp).  Xuplast_avx_80 loads X[2], Tx[1] and the four input
     # vectors and resets $Xi to 0 right before the first call.
     #
     # $body is a code ref.  It is invoked four times and each call returns a
     # list of Perl snippets (strings) that are eval'ed in this scope.
     #
     # Side effect: increments the file-level $Xi (no rotation of @X/@Tx).
 946 { use integer;
 947   my $body = shift;
 948   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably used by the eval'ed
       # snippets, which run in this lexical scope -- do not rename.
 949   my ($a,$b,$c,$d,$e);
 950
 951          eval(shift(@insns));
 952          eval(shift(@insns));
             # byte swap the vector that the NEXT call will add K to
 953         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
 954          eval(shift(@insns));
 955          eval(shift(@insns));
             # X[$Xi] = (already swapped) input vector + K
 956         &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
 957          eval(shift(@insns));
 958          eval(shift(@insns));
 959          eval(shift(@insns));
 960          eval(shift(@insns));
 961         &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
 962          eval(shift(@insns));
 963          eval(shift(@insns));
 964
             # emit whatever scalar snippets are still queued
 965         foreach (@insns) { eval; }
 966   $Xi++;
 967 }
 968
 969 sub Xtail_avx()
     # Emit the scalar snippets of $body with no vector work in between.
     # Used for the .Ldone_avx tail in place of Xloop_avx.
     #
     # $body is a code ref.  It is invoked four times and each call returns a
     # list of Perl snippets (strings) that are eval'ed in this scope.
 970 { use integer;
 971   my $body = shift;
 972   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably used by the eval'ed
       # snippets, which run in this lexical scope -- do not rename.
 973   my ($a,$b,$c,$d,$e);
 974
 975         foreach (@insns) { eval; }
 976 }
 977
 978 $code.=<<___;        # start of the AVX main loop (target of "jmp .Loop_avx" below)
 979 .align  16
 980 .Loop_avx:
 981 ___
             # Twenty helper calls per loop iteration, each invoking its round
             # callback four times: body_00_19 for the first five calls,
             # body_20_39 for the next five, body_40_59 for the next five and
             # body_20_39 again for the last five (Xuplast_avx_80 plus the
             # three Xloop_avx calls belong to that last group).
 982         &Xupdate_avx_16_31(\&body_00_19);
 983         &Xupdate_avx_16_31(\&body_00_19);
 984         &Xupdate_avx_16_31(\&body_00_19);
 985         &Xupdate_avx_16_31(\&body_00_19);
 986         &Xupdate_avx_32_79(\&body_00_19);
 987         &Xupdate_avx_32_79(\&body_20_39);
 988         &Xupdate_avx_32_79(\&body_20_39);
 989         &Xupdate_avx_32_79(\&body_20_39);
 990         &Xupdate_avx_32_79(\&body_20_39);
 991         &Xupdate_avx_32_79(\&body_20_39);
 992         &Xupdate_avx_32_79(\&body_40_59);
 993         &Xupdate_avx_32_79(\&body_40_59);
 994         &Xupdate_avx_32_79(\&body_40_59);
 995         &Xupdate_avx_32_79(\&body_40_59);
 996         &Xupdate_avx_32_79(\&body_40_59);
 997         &Xupdate_avx_32_79(\&body_20_39);
 998         &Xuplast_avx_80(\&body_20_39);  # can jump to "done"
 999
                                     # Snapshot the generator state here.  The
                                     # code emitted by Xuplast_avx_80 branches to
                                     # .Ldone_avx, and that tail is generated
                                     # further down from this same state.
1000                                 $saved_j=$j; @saved_V=@V;
1001                                 $saved_r=$r; @saved_rndkey=@rndkey;
1002
             # remaining snippets of this block, interleaved with the
             # preparation of the next block's input
1003         &Xloop_avx(\&body_20_39);
1004         &Xloop_avx(\&body_20_39);
1005         &Xloop_avx(\&body_20_39);
1006
1007 $code.=<<___;
1008         vmovups $iv,48($out,$in0)               # write output
1009         lea     64($in0),$in0
1010
1011         add     0($ctx),$A                      # update context
1012         add     4($ctx),@T[0]
1013         add     8($ctx),$C
1014         add     12($ctx),$D
1015         mov     $A,0($ctx)
1016         add     16($ctx),$E
1017         mov     @T[0],4($ctx)
1018         mov     @T[0],$B                        # magic seed
1019         mov     $C,8($ctx)
1020         mov     $D,12($ctx)
1021         mov     $E,16($ctx)
1022         jmp     .Loop_avx
1023
1024 .align  16
1025 .Ldone_avx:
1026 ___
                                     # Rewind to the state saved right after
                                     # Xuplast_avx_80 ($jj is reset together
                                     # with $j) so the tail continues from the
                                     # point where the branch was emitted.
1027                                 $jj=$j=$saved_j; @V=@saved_V;
1028                                 $r=$saved_r;     @rndkey=@saved_rndkey;
1029
             # same remaining snippets as the Xloop_avx calls above, but
             # without loading or preparing any further input
1030         &Xtail_avx(\&body_20_39);
1031         &Xtail_avx(\&body_20_39);
1032         &Xtail_avx(\&body_20_39);
1033
1034 $code.=<<___;
1035         vmovups $iv,48($out,$in0)               # write output
1036         mov     88(%rsp),$ivp                   # restore $ivp
1037
1038         add     0($ctx),$A                      # update context
1039         add     4($ctx),@T[0]
1040         add     8($ctx),$C
1041         mov     $A,0($ctx)
1042         add     12($ctx),$D
1043         mov     @T[0],4($ctx)
1044         add     16($ctx),$E
1045         mov     $C,8($ctx)
1046         mov     $D,12($ctx)
1047         mov     $E,16($ctx)
1048         vmovups $iv,($ivp)                      # write IV
1049         vzeroall
1050 ___
     # Win64 only: reload xmm6-xmm15 from the ten 16-byte slots at 96(%rsp).
1051 $code.=<<___ if ($win64);
1052         movaps  96+0(%rsp),%xmm6
1053         movaps  96+16(%rsp),%xmm7
1054         movaps  96+32(%rsp),%xmm8
1055         movaps  96+48(%rsp),%xmm9
1056         movaps  96+64(%rsp),%xmm10
1057         movaps  96+80(%rsp),%xmm11
1058         movaps  96+96(%rsp),%xmm12
1059         movaps  96+112(%rsp),%xmm13
1060         movaps  96+128(%rsp),%xmm14
1061         movaps  96+144(%rsp),%xmm15
1062 ___
     # Epilogue: reload r15, r14, r13, r12, rbp and rbx from the area that
     # starts 104 bytes (plus 10*16 bytes of xmm slots on Win64) above %rsp,
     # then release the frame and return.
1063 $code.=<<___;
1064         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1065         mov     0(%rsi),%r15
1066         mov     8(%rsi),%r14
1067         mov     16(%rsi),%r13
1068         mov     24(%rsi),%r12
1069         mov     32(%rsi),%rbp
1070         mov     40(%rsi),%rbx
1071         lea     48(%rsi),%rsp
1072 .Lepilogue_avx:
1073         ret
1074 .size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1075 ___
1076 }
1077 $code.=<<___;        # constant table: entries are 16 bytes each and addressed as N*16($K_XX_XX) by the generators above
1078 .align  64
1079 K_XX_XX:
1080 .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1081 .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1082 .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1083 .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1084 .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
1085
1086 .asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1087 .align  64
1088 ___
1089
1090 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1091 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
     #
     # Win64 only.  Emits ssse3_handler, the unwind handler registered for
     # aesni_cbc_sha1_enc_ssse3 and (when $avx is set) for
     # aesni_cbc_sha1_enc_avx, followed by the .pdata/.xdata records that
     # register it.  HandlerData[] of each record holds that function's
     # prologue and epilogue labels.
     #
     # What the emitted handler does, in order:
     #  - Rip below the prologue label: go to the common tail with %rax set
     #    to context->Rax.
     #  - Rip at or past the epilogue label: go to the common tail with
     #    %rax set to context->Rsp.
     #  - otherwise: copy 20 quadwords (the ten xmm save slots) from 96 bytes
     #    above the interrupted Rsp into context.Xmm6 onward, reload the six
     #    saved general-purpose registers from above those slots and write
     #    them back into the context.
     #  - common tail: store Rsp/Rsi/Rdi into the context, copy the context
     #    to disp->ContextRecord, call RtlVirtualUnwind and return
     #    ExceptionContinueSearch.
1092 if ($win64) {
     # the four handler arguments, in the order of the prototype above
1093 $rec="%rcx";
1094 $frame="%rdx";
1095 $context="%r8";
1096 $disp="%r9";
1097
1098 $code.=<<___;
1099 .extern __imp_RtlVirtualUnwind
1100 .type   ssse3_handler,\@abi-omnipotent
1101 .align  16
1102 ssse3_handler:
1103         push    %rsi
1104         push    %rdi
1105         push    %rbx
1106         push    %rbp
1107         push    %r12
1108         push    %r13
1109         push    %r14
1110         push    %r15
1111         pushfq
1112         sub     \$64,%rsp
1113
1114         mov     120($context),%rax      # pull context->Rax
1115         mov     248($context),%rbx      # pull context->Rip
1116
1117         mov     8($disp),%rsi           # disp->ImageBase
1118         mov     56($disp),%r11          # disp->HandlerData
1119
1120         mov     0(%r11),%r10d           # HandlerData[0]
1121         lea     (%rsi,%r10),%r10        # prologue label
1122         cmp     %r10,%rbx               # context->Rip<prologue label
1123         jb      .Lcommon_seh_tail
1124
1125         mov     152($context),%rax      # pull context->Rsp
1126
1127         mov     4(%r11),%r10d           # HandlerData[1]
1128         lea     (%rsi,%r10),%r10        # epilogue label
1129         cmp     %r10,%rbx               # context->Rip>=epilogue label
1130         jae     .Lcommon_seh_tail
1131
1132         lea     96(%rax),%rsi
1133         lea     512($context),%rdi      # &context.Xmm6
1134         mov     \$20,%ecx
1135         .long   0xa548f3fc              # cld; rep movsq
1136         lea     `104+10*16`(%rax),%rax  # adjust stack pointer
1137
1138         mov     0(%rax),%r15
1139         mov     8(%rax),%r14
1140         mov     16(%rax),%r13
1141         mov     24(%rax),%r12
1142         mov     32(%rax),%rbp
1143         mov     40(%rax),%rbx
1144         lea     48(%rax),%rax
1145         mov     %rbx,144($context)      # restore context->Rbx
1146         mov     %rbp,160($context)      # restore context->Rbp
1147         mov     %r12,216($context)      # restore context->R12
1148         mov     %r13,224($context)      # restore context->R13
1149         mov     %r14,232($context)      # restore context->R14
1150         mov     %r15,240($context)      # restore context->R15
1151
1152 .Lcommon_seh_tail:
1153         mov     8(%rax),%rdi
1154         mov     16(%rax),%rsi
1155         mov     %rax,152($context)      # restore context->Rsp
1156         mov     %rsi,168($context)      # restore context->Rsi
1157         mov     %rdi,176($context)      # restore context->Rdi
1158
1159         mov     40($disp),%rdi          # disp->ContextRecord
1160         mov     $context,%rsi           # context
1161         mov     \$154,%ecx              # sizeof(CONTEXT)
1162         .long   0xa548f3fc              # cld; rep movsq
1163
1164         mov     $disp,%rsi
1165         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1166         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1167         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1168         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1169         mov     40(%rsi),%r10           # disp->ContextRecord
1170         lea     56(%rsi),%r11           # &disp->HandlerData
1171         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1172         mov     %r10,32(%rsp)           # arg5
1173         mov     %r11,40(%rsp)           # arg6
1174         mov     %r12,48(%rsp)           # arg7
1175         mov     %rcx,56(%rsp)           # arg8, (NULL)
1176         call    *__imp_RtlVirtualUnwind(%rip)
1177
1178         mov     \$1,%eax                # ExceptionContinueSearch
1179         add     \$64,%rsp
1180         popfq
1181         pop     %r15
1182         pop     %r14
1183         pop     %r13
1184         pop     %r12
1185         pop     %rbp
1186         pop     %rbx
1187         pop     %rdi
1188         pop     %rsi
1189         ret
1190 .size   ssse3_handler,.-ssse3_handler
1191
1192 .section        .pdata
1193 .align  4
1194         .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1195         .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
1196         .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
1197 ___
     # second .pdata entry, only when the AVX entry point is generated
1198 $code.=<<___ if ($avx);
1199         .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
1200         .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
1201         .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
1202 ___
     # .xdata: unwind info naming ssse3_handler plus that function's
     # prologue/epilogue labels as HandlerData[]
1203 $code.=<<___;
1204 .section        .xdata
1205 .align  8
1206 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
1207         .byte   9,0,0,0
1208         .rva    ssse3_handler
1209         .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
1210 ___
     # the AVX entry point reuses the same handler with its own labels
1211 $code.=<<___ if ($avx);
1212 .LSEH_info_aesni_cbc_sha1_enc_avx:
1213         .byte   9,0,0,0
1214         .rva    ssse3_handler
1215         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1216 ___
1217 }
1218
1219 ####################################################################
sub rex {
  # Append a REX prefix byte to an opcode byte list when either register
  # number is 8 or above.
  #   $bytes - reference to the array of opcode bytes built so far
  #   $dst   - register number encoded in ModR/M.reg  (sets REX.R, 0x04)
  #   $src   - register number encoded in ModR/M.r/m  (sets REX.B, 0x01)
  # Nothing is appended when both numbers are below 8.
  my ($bytes,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    push @$bytes,$rex|0x40      if($rex);
}

sub aesni {
  # Translate a register-to-register "aesenc"/"aesenclast" instruction
  # (AT&T operand order: source first, destination second) into an
  # equivalent ".byte" directive.
  #
  # Returns the ".byte\t..." string for the two mnemonics known here.  Any
  # other text is returned unchanged, including an "aes*" mnemonic that is
  # not in the table.  That last case used to return undef, which made the
  # s///e caller replace the instruction with an empty string and drop it
  # from the generated assembly without any diagnostic.
  my $line=shift;

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # copy the captures so that later code cannot disturb them
        my ($mnemonic,$src,$dst)=($1,$2,$3);
        my %opcodelet = (
                "aesenc" => 0xdc,       "aesenclast" => 0xdd
        );
        # not ours to encode: hand the text to the assembler as it is
        return $line if (!defined($opcodelet{$mnemonic}));

        my @opcode=(0x66);                      # mandatory prefix
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1246
1247 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1248 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1249
1250 print $code;
1251 close STDOUT;