#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, for better overall performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on the AES key length. Below are
# performance numbers in cycles per processed byte (less is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)gain
# Sandy Bridge	    5.05/6.05/7.05+11.6		13.0	+28%/36%/43%
# Ivy Bridge	    5.05/6.05/7.05+10.3		11.6	+32%/41%/50%
# Haswell	    4.43/5.29/6.19+7.80		8.79	+39%/49%/59%
# Bulldozer	    5.77/6.89/8.00+13.7		13.7	+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the estimated gain was not high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
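#
#	For example, the Sandy Bridge AES-128 gain above follows from
#	the standalone costs: 5.05+11.6 = 16.65 cpb done serially vs.
#	13.0 cpb stitched, i.e. 16.65/13.0 - 1 =~ +28% (a back-of-the-
#	envelope check of the table, same arithmetic for other rows);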

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
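# The two positional arguments above are the perlasm "flavour" (elf,
# macosx, mingw64, nasm, ...) and the output file name; a typical
# invocation looks like this (illustrative only):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.S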

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
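#
# A minimal C-side sketch of a call (hypothetical caller, names are
# illustrative; per the "shl \$6,$len" in the prologues below, length
# is the number of 64-byte blocks, and calling with inp==NULL merely
# probes for capability, returning non-zero if a SIMD path exists):
#
#	AES_KEY		aes;	/* set up with AES_set_encrypt_key() */
#	SHA256_CTX	sha;	/* set up with SHA256_Init() etc.    */
#	unsigned char	iv[16];
#	aesni_cbc_sha256_enc(inp,out,blocks,&aes,iv,&sha,in0);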
($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
$framesz=16*$SZ+8*8;

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@abi-omnipotent
.align	16
$func:
___
						if ($avx) {
$code.=<<___;
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	\$1,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10
___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext
___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	${func}_avx2
___
$code.=<<___;
	and	\$`1<<28`,%r10d			# check for AVX
	jnz	${func}_avx
	ud2
___
						}
$code.=<<___;
	xor	%eax,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	ud2
.Lprobe:
	ret
.size	$func,.-$func

.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
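	# note: each K256 row above is stored twice so that one table can
	# be read in both 128-bit (xmm) and 256-bit (ymm) chunks; the two
	# rows after K256 hold the byte-swap mask, and the trailing 0/-1
	# rows supply the CBC output masks (mask10/mask12/mask14 below),
	# indexed by the AES round count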
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,"($out,$inp)");
##	&lea		($inp,"16($inp)");
);

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
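#
# For instance, under this thunk a call like &ror($a0,$Sigma1[2]-$Sigma1[1]),
# i.e. &ror("%r13d",14), appends "\tror\t\$14,%r13d\n" to $code: a numeric
# last argument grows a '$' immediate prefix and the operand list is
# reversed into AT&T order.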

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&xor	($a0,$e)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
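#
# In other words each body_00_15 step computes one textbook SHA-256
# round, with the Maj() term settled via the rotating $a2/$a3 pair:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = Sigma0(a) + Maj(a,b,c) + T1,  d += T1
#
# interleaved with one slot of the AES-CBC schedule pulled from
# @aesni_cbc_block[$aesni_cbc_idx++].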

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type	${func}_xop,\@function,6
.align	64
${func}_xop:
.Lxop_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%r11,$_rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
	jmp	.Lloop_xop
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_xop

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_xop:
	ret
.size	${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	${func}_avx,\@function,6
.align	64
${func}_avx:
.Lavx_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%r11,$_rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
	mov	%r12,$_inp		# $a4
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	 '&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	 '&vpxor	($t2,$t2,$t3);',
	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
	 '&vpshufd	($t2,$t2,0b10000100)',
	 '&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	 '&vpsrld	($t2,$t3,$sigma1[2])',
	 '&vpsrlq	($t3,$t3,$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3);',
	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3)',
	 '&vpshufd	($t2,$t2,0b11101000)',
	 '&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
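#
# The sequence above is the usual shift/xor decomposition of the SHA-256
# message-schedule functions, i.e. (per @sigma0/@sigma1 above)
#
#	sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
#	sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
#
# built from vpsrld/vpslld/vpsrlq pairs because AVX1, unlike XOP's
# vprotd, has no packed rotate instruction.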

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov		($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)
	jb	.Lloop_avx

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to do $a+=$a1
}
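#
# Unlike body_00_15, this BMI2 variant uses the flag-free rorx in place
# of ror, and lea in place of add, which avoids false dependencies on
# the flags register; the price is that "h+=Sigma0(a)" is deferred
# ("from the past") and settled one round later, and once more after
# the last round, per the notes above.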

$code.=<<___;
.type	${func}_avx2,\@function,6
.align	64
${func}_avx2:
.Lavx2_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%r11,$_rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroall

	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	lea	-9(%r14),%r14

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13		# inp++, size optimization
	mov	$SZ*0(%r15),$A
	lea	(%rsi,%r13),%r12	# borrow $a0
	mov	$SZ*1(%r15),$B
	cmp	$len,%r13		# $_end
	mov	$SZ*2(%r15),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	 vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	 vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq		("%r13",$offload);	# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand		($temp,$temp,$mask14);
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea		("%r13","16(%r13)");	# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					}
$code.=<<___;
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r12,%r13)			# write output
	lea	16(%r13),%r13

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	mov	$F,$a4
	xor	$C,$a3			# magic
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
$code.=<<___;
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	lea	(%rsi,%r13),%r12
	add	$SZ*7(%r15),$H

	cmp	$_end,%r13

	mov	$A,$SZ*0(%r15)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
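#
# A note on the $k==9 tail above: $rounds holds the AES round count
# loaded from the key schedule (10/12/14 for AES-128/-192/-256), so the
# cmp/jb/je ladder performs 0, 2 or 4 extra aesenc steps before
# aesenclast, letting one emitted code path serve all three key lengths.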

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	mov		240($key),$rounds
	sub		$in0,$out
	movups		($key),$rndkey0		# $key[0]
	movups		16($key),$rndkey[0]	# forward reference
	lea		112($key),$key		# size optimization

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	pshufb		$TMP,@MSG[0]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	pshufb		$TMP,@MSG[1]
	movdqa		$CDGH,$CDGH_SAVE	# offload
	movdqa		$ABEF,$ABEF_SAVE	# offload
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	lea		0x40($inp),$inp
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	paddd		$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	paddd		$TMP,@MSG[1]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc()	if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor		$CDGH,$rndkey0		# black magic
___
	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	#xorps		$CDGH,$rndkey0		# black magic
	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF

	dec		$len
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$ABEF,($ctx)
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
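#
# For example, for "sha256rnds2 %xmm10,%xmm1" below, rex() is called
# with ($dst,$src)=(1,10): the extended source register sets the REX.B
# bit and 0x41 is prepended to the opcode bytes, while operands on
# %xmm0-%xmm7 only need no prefix at all.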

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
  }
}
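#
# As a concrete check (hypothetical input, worked by hand):
# sha256op38("sha256rnds2","%xmm4,%xmm0") matches the regexp with
# ($1,$2)=(4,0), needs no REX prefix, and returns
# ".byte\t15,56,203,196" (0x0f,0x38,0xcb,0xc4), the encoding of
# "sha256rnds2 %xmm4,%xmm0" for assemblers too old to know the mnemonic.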

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;