From 3971092e119dd117e9e40f6b5955f54a2762dcf3 Mon Sep 17 00:00:00 2001 From: Jung-uk Kim Date: Wed, 26 Aug 2020 16:56:44 +0000 Subject: [PATCH] Regen X86 assembly files after r364822. --- secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S | 784 +- secure/lib/libcrypto/amd64/aesni-mb-x86_64.S | 965 ++ .../lib/libcrypto/amd64/aesni-sha1-x86_64.S | 1350 +- .../lib/libcrypto/amd64/aesni-sha256-x86_64.S | 4376 ++++++ secure/lib/libcrypto/amd64/chacha-x86_64.S | 1026 ++ .../lib/libcrypto/amd64/ecp_nistz256-x86_64.S | 2503 +++- secure/lib/libcrypto/amd64/ghash-x86_64.S | 475 +- secure/lib/libcrypto/amd64/poly1305-x86_64.S | 1785 +++ secure/lib/libcrypto/amd64/rsaz-avx2.S | 1749 ++- secure/lib/libcrypto/amd64/rsaz-x86_64.S | 664 + secure/lib/libcrypto/amd64/sha1-mb-x86_64.S | 4315 ++++++ secure/lib/libcrypto/amd64/sha1-x86_64.S | 2829 ++++ secure/lib/libcrypto/amd64/sha256-mb-x86_64.S | 4672 ++++++ secure/lib/libcrypto/amd64/sha256-x86_64.S | 2369 +++ secure/lib/libcrypto/amd64/sha512-x86_64.S | 3660 +++++ secure/lib/libcrypto/amd64/x25519-x86_64.S | 390 +- secure/lib/libcrypto/amd64/x86_64-mont.S | 380 + secure/lib/libcrypto/amd64/x86_64-mont5.S | 1365 ++ secure/lib/libcrypto/i386/chacha-x86.S | 960 ++ secure/lib/libcrypto/i386/poly1305-x86.S | 1110 ++ secure/lib/libcrypto/i386/sha1-586.S | 6222 +++++--- secure/lib/libcrypto/i386/sha256-586.S | 11930 +++++++++++----- 22 files changed, 49916 insertions(+), 5963 deletions(-) diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S index 1cdcc86043b..26e49f9b297 100644 --- a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S @@ -2,20 +2,790 @@ /* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. 
*/ .text -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -aesni_gcm_encrypt: +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: .cfi_startproc - xorl %eax,%eax + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor 
%xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq 
$0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%ebp + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc 
%xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups 
%xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + .byte 0xf3,0xc3 .cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt - +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function +.align 32 aesni_gcm_decrypt: .cfi_startproc - xorl %eax,%eax + xorq %r10,%r10 + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 240-128(%rcx),%ebp + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + leaq -192(%rdi,%rdx,1),%r15 + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq 
-48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + movq %r10,%rax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%rbp),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + 
vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%ebp + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + leaq -192(%rsi,%rdx,1),%r15 + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu 
(%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + 
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + 
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S index de4bac9488f..706c5c59d38 
100644 --- a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S @@ -9,6 +9,14 @@ .align 32 aesni_multi_cbc_encrypt: .cfi_startproc + cmpl $2,%edx + jb .Lenc_non_avx + movl OPENSSL_ia32cap_P+4(%rip),%ecx + testl $268435456,%ecx + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -283,6 +291,14 @@ aesni_multi_cbc_encrypt: .align 32 aesni_multi_cbc_decrypt: .cfi_startproc + cmpl $2,%edx + jb .Ldec_non_avx + movl OPENSSL_ia32cap_P+4(%rip),%ecx + testl $268435456,%ecx + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -542,3 +558,952 @@ aesni_multi_cbc_decrypt: .byte 0xf3,0xc3 .cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +.type aesni_multi_cbc_encrypt_avx,@function +.align 32 +aesni_multi_cbc_encrypt_avx: +.cfi_startproc +_avx_cbc_enc_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + + + + + + + + subq $192,%rsp + andq $-128,%rsp + movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 + +.Lenc8x_body: + vzeroupper + vmovdqu (%rsi),%xmm15 + leaq 120(%rsi),%rsi + leaq 160(%rdi),%rdi + shrl $1,%edx + +.Lenc8x_loop_grande: + + xorl %edx,%edx + movl -144(%rdi),%ecx + movq -160(%rdi),%r8 + cmpl %edx,%ecx + movq -152(%rdi),%rbx + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -136(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + subq %r8,%rbx + movq %rbx,64(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 + cmpl %edx,%ecx + movq -112(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + subq %r9,%rbp + movq %rbp,72(%rsp) + movl -64(%rdi),%ecx + movq 
-80(%rdi),%r10 + cmpl %edx,%ecx + movq -72(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + subq %r10,%rbp + movq %rbp,80(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 + cmpl %edx,%ecx + movq -32(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + subq %r11,%rbp + movq %rbp,88(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 + cmpl %edx,%ecx + movq 8(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 + movl %ecx,48(%rsp) + cmovleq %rsp,%r12 + subq %r12,%rbp + movq %rbp,96(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 + cmpl %edx,%ecx + movq 48(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 + movl %ecx,52(%rsp) + cmovleq %rsp,%r13 + subq %r13,%rbp + movq %rbp,104(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 + cmpl %edx,%ecx + movq 88(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 + movl %ecx,56(%rsp) + cmovleq %rsp,%r14 + subq %r14,%rbp + movq %rbp,112(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 + cmpl %edx,%ecx + movq 128(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 + movl %ecx,60(%rsp) + cmovleq %rsp,%r15 + subq %r15,%rbp + movq %rbp,120(%rsp) + testl %edx,%edx + jz .Lenc8x_done + + vmovups 16-120(%rsi),%xmm1 + vmovups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + + vpxor (%r8),%xmm15,%xmm10 + leaq 128(%rsp),%rbp + vpxor (%r9),%xmm15,%xmm11 + vpxor (%r10),%xmm15,%xmm12 + vpxor (%r11),%xmm15,%xmm13 + vpxor %xmm10,%xmm2,%xmm2 + vpxor (%r12),%xmm15,%xmm10 + vpxor %xmm11,%xmm3,%xmm3 + vpxor (%r13),%xmm15,%xmm11 + vpxor %xmm12,%xmm4,%xmm4 + vpxor (%r14),%xmm15,%xmm12 + vpxor %xmm13,%xmm5,%xmm5 + vpxor (%r15),%xmm15,%xmm13 + vpxor %xmm10,%xmm6,%xmm6 + movl $1,%ecx + vpxor %xmm11,%xmm7,%xmm7 + vpxor %xmm12,%xmm8,%xmm8 + vpxor %xmm13,%xmm9,%xmm9 + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 
32+0(%rsp),%ecx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r8) + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r8,%rbx,1),%rbx + cmovgeq %rsp,%r8 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r8,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r8),%xmm15,%xmm10 + movq %rbx,64+0(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -72(%rsi),%xmm1 + leaq 16(%r8,%rbx,1),%r8 + vmovdqu %xmm10,0(%rbp) + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+4(%rsp),%ecx + movq 64+8(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r9) + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r9,%rbx,1),%rbx + cmovgeq %rsp,%r9 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r9,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r9),%xmm15,%xmm11 + movq %rbx,64+8(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups -56(%rsi),%xmm0 + leaq 16(%r9,%rbx,1),%r9 + vmovdqu %xmm11,16(%rbp) + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+8(%rsp),%ecx + movq 64+16(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r10) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r8) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r10,%rbx,1),%rbx + cmovgeq %rsp,%r10 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r10,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r10),%xmm15,%xmm12 + movq %rbx,64+16(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -40(%rsi),%xmm1 + leaq 16(%r10,%rbx,1),%r10 + vmovdqu %xmm12,32(%rbp) + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+12(%rsp),%ecx + movq 64+24(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r11) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r9) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r11,%rbx,1),%rbx + cmovgeq %rsp,%r11 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r11,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r11),%xmm15,%xmm13 + movq %rbx,64+24(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups -24(%rsi),%xmm0 + leaq 16(%r11,%rbx,1),%r11 + vmovdqu 
%xmm13,48(%rbp) + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+16(%rsp),%ecx + movq 64+32(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r12) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r10) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r12,%rbx,1),%rbx + cmovgeq %rsp,%r12 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r12,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r12),%xmm15,%xmm10 + movq %rbx,64+32(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -8(%rsi),%xmm1 + leaq 16(%r12,%rbx,1),%r12 + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+20(%rsp),%ecx + movq 64+40(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r13) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r11) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%rbx,%r13,1),%rbx + cmovgeq %rsp,%r13 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r13,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r13),%xmm15,%xmm11 + movq %rbx,64+40(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 8(%rsi),%xmm0 + leaq 16(%r13,%rbx,1),%r13 + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+24(%rsp),%ecx + movq 64+48(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r14) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r12) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r14,%rbx,1),%rbx + cmovgeq %rsp,%r14 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r14,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r14),%xmm15,%xmm12 + movq %rbx,64+48(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 24(%rsi),%xmm1 + leaq 16(%r14,%rbx,1),%r14 + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+28(%rsp),%ecx + movq 64+56(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r15) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r13) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r15,%rbx,1),%rbx + cmovgeq %rsp,%r15 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r15,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r15),%xmm15,%xmm13 + movq %rbx,64+56(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 
40(%rsi),%xmm0 + leaq 16(%r15,%rbx,1),%r15 + vmovdqu 32(%rsp),%xmm14 + prefetcht0 15(%r14) + prefetcht0 15(%r15) + cmpl $11,%eax + jb .Lenc8x_tail + + vaesenc %xmm1,%xmm2,%xmm2 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 176-120(%rsi),%xmm1 + + vaesenc %xmm0,%xmm2,%xmm2 + vaesenc %xmm0,%xmm3,%xmm3 + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + vaesenc %xmm0,%xmm6,%xmm6 + vaesenc %xmm0,%xmm7,%xmm7 + vaesenc %xmm0,%xmm8,%xmm8 + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 192-120(%rsi),%xmm0 + je .Lenc8x_tail + + vaesenc %xmm1,%xmm2,%xmm2 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 208-120(%rsi),%xmm1 + + vaesenc %xmm0,%xmm2,%xmm2 + vaesenc %xmm0,%xmm3,%xmm3 + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + vaesenc %xmm0,%xmm6,%xmm6 + vaesenc %xmm0,%xmm7,%xmm7 + vaesenc %xmm0,%xmm8,%xmm8 + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 224-120(%rsi),%xmm0 + +.Lenc8x_tail: + vaesenc %xmm1,%xmm2,%xmm2 + vpxor %xmm15,%xmm15,%xmm15 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vpaddd %xmm14,%xmm15,%xmm15 + vmovdqu 48(%rsp),%xmm14 + vaesenc %xmm1,%xmm7,%xmm7 + movq 64(%rsp),%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 16-120(%rsi),%xmm1 + + vaesenclast %xmm0,%xmm2,%xmm2 + vmovdqa %xmm15,32(%rsp) + vpxor %xmm15,%xmm15,%xmm15 + vaesenclast %xmm0,%xmm3,%xmm3 + vaesenclast %xmm0,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesenclast %xmm0,%xmm5,%xmm5 + vaesenclast %xmm0,%xmm6,%xmm6 + vpaddd %xmm15,%xmm14,%xmm14 + vmovdqu -120(%rsi),%xmm15 + vaesenclast %xmm0,%xmm7,%xmm7 + vaesenclast %xmm0,%xmm8,%xmm8 + vmovdqa %xmm14,48(%rsp) + 
vaesenclast %xmm0,%xmm9,%xmm9 + vmovups 32-120(%rsi),%xmm0 + + vmovups %xmm2,-16(%r8) + subq %rbx,%r8 + vpxor 0(%rbp),%xmm2,%xmm2 + vmovups %xmm3,-16(%r9) + subq 72(%rsp),%r9 + vpxor 16(%rbp),%xmm3,%xmm3 + vmovups %xmm4,-16(%r10) + subq 80(%rsp),%r10 + vpxor 32(%rbp),%xmm4,%xmm4 + vmovups %xmm5,-16(%r11) + subq 88(%rsp),%r11 + vpxor 48(%rbp),%xmm5,%xmm5 + vmovups %xmm6,-16(%r12) + subq 96(%rsp),%r12 + vpxor %xmm10,%xmm6,%xmm6 + vmovups %xmm7,-16(%r13) + subq 104(%rsp),%r13 + vpxor %xmm11,%xmm7,%xmm7 + vmovups %xmm8,-16(%r14) + subq 112(%rsp),%r14 + vpxor %xmm12,%xmm8,%xmm8 + vmovups %xmm9,-16(%r15) + subq 120(%rsp),%r15 + vpxor %xmm13,%xmm9,%xmm9 + + decl %edx + jnz .Loop_enc8x + + movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 + + + + + +.Lenc8x_done: + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lenc8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx +/* aesni_multi_cbc_decrypt_avx: AES-CBC decryption of up to eight independent streams at once, one block per stream per loop pass, using AVX-encoded AES-NI (vaesdec and vaesdeclast). As read by this code: %rdi points to eight 40-byte stream descriptors (input pointer at offset 0, output pointer at 8, 32-bit block count at 16, 16-byte IV at 24); %rsi points to the key schedule, whose 32-bit value at offset 240 selects how many rounds are run; the incoming %edx is not used (see the note at .Ldec8x_loop_grande). Streams 0-7 live in %r8-%r15 and %xmm2-%xmm9. */ +.type aesni_multi_cbc_decrypt_avx,@function +.align 32 +aesni_multi_cbc_decrypt_avx: +.cfi_startproc +_avx_cbc_dec_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +/* Frame: %rax keeps the entry %rsp. %rsp is lowered by 256, rounded down to a 256-byte boundary, lowered by another 192, and the entry %rsp is stored at 16(%rsp). The .cfi_escape that follows is a DW_CFA_def_cfa_expression giving the CFA as that stored value plus 8. Layout used later: 32..63(%rsp) remaining-block counters, 64..127(%rsp) per-stream output-minus-input offsets, 128..191(%rsp) next ciphertext block of streams 0-3, 192..447(%rsp) two 128-byte chaining buffers. */ + + + + + + + + + subq $256,%rsp + andq $-256,%rsp + subq $192,%rsp + movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 +/* %xmm15 = round key 0. %rsi and %rdi are then biased by 120 and 160; every later displacement is written relative to the biased pointers. */ +.Ldec8x_body: + vzeroupper + vmovdqu (%rsi),%xmm15 + leaq 120(%rsi),%rsi + leaq 160(%rdi),%rdi + shrl $1,%edx +/* NOTE: the shrl result above is discarded by the xorl just below, and no branch to .Ldec8x_loop_grande is visible in this function. */ +.Ldec8x_loop_grande: +/* Per-stream setup, repeated eight times: load the block count, input pointer, output pointer and IV; keep the running maximum count in %edx; store the count at 32(%rsp) onward, output-minus-input at 64(%rsp) onward and the IV at 192(%rsp) onward. A stream whose count is not positive gets %rsp as its input pointer, so its block loads read from the stack frame. */ + xorl %edx,%edx + movl -144(%rdi),%ecx + movq -160(%rdi),%r8 + cmpl %edx,%ecx + movq -152(%rdi),%rbx + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu
-136(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + subq %r8,%rbx + movq %rbx,64(%rsp) + vmovdqu %xmm2,192(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 + cmpl %edx,%ecx + movq -112(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + subq %r9,%rbp + movq %rbp,72(%rsp) + vmovdqu %xmm3,208(%rsp) + movl -64(%rdi),%ecx + movq -80(%rdi),%r10 + cmpl %edx,%ecx + movq -72(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + subq %r10,%rbp + movq %rbp,80(%rsp) + vmovdqu %xmm4,224(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 + cmpl %edx,%ecx + movq -32(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + subq %r11,%rbp + movq %rbp,88(%rsp) + vmovdqu %xmm5,240(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 + cmpl %edx,%ecx + movq 8(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 + movl %ecx,48(%rsp) + cmovleq %rsp,%r12 + subq %r12,%rbp + movq %rbp,96(%rsp) + vmovdqu %xmm6,256(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 + cmpl %edx,%ecx + movq 48(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 + movl %ecx,52(%rsp) + cmovleq %rsp,%r13 + subq %r13,%rbp + movq %rbp,104(%rsp) + vmovdqu %xmm7,272(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 + cmpl %edx,%ecx + movq 88(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 + movl %ecx,56(%rsp) + cmovleq %rsp,%r14 + subq %r14,%rbp + movq %rbp,112(%rsp) + vmovdqu %xmm8,288(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 + cmpl %edx,%ecx + movq 128(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 + movl %ecx,60(%rsp) + cmovleq %rsp,%r15 + subq %r15,%rbp + movq %rbp,120(%rsp) + vmovdqu %xmm9,304(%rsp) + testl %edx,%edx + jz .Ldec8x_done +/* %edx is zero here when no stream has a positive count. Otherwise preload round keys 1 and 2 into %xmm1 and %xmm0, the 32-bit value at offset 240 of the key schedule into %eax, and point %rbp at the upper chaining buffer, 320(%rsp). */ + vmovups 16-120(%rsi),%xmm1 + vmovups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + leaq
192+128(%rsp),%rbp +/* Load the first ciphertext block of every stream, keep a copy in the upper buffer as the chaining value for the following block, and apply round key 0. The xorq then moves %rbp to the lower buffer at 192(%rsp), which holds the IVs. */ + vmovdqu (%r8),%xmm2 + vmovdqu (%r9),%xmm3 + vmovdqu (%r10),%xmm4 + vmovdqu (%r11),%xmm5 + vmovdqu (%r12),%xmm6 + vmovdqu (%r13),%xmm7 + vmovdqu (%r14),%xmm8 + vmovdqu (%r15),%xmm9 + vmovdqu %xmm2,0(%rbp) + vpxor %xmm15,%xmm2,%xmm2 + vmovdqu %xmm3,16(%rbp) + vpxor %xmm15,%xmm3,%xmm3 + vmovdqu %xmm4,32(%rbp) + vpxor %xmm15,%xmm4,%xmm4 + vmovdqu %xmm5,48(%rbp) + vpxor %xmm15,%xmm5,%xmm5 + vmovdqu %xmm6,64(%rbp) + vpxor %xmm15,%xmm6,%xmm6 + vmovdqu %xmm7,80(%rbp) + vpxor %xmm15,%xmm7,%xmm7 + vmovdqu %xmm8,96(%rbp) + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu %xmm9,112(%rbp) + vpxor %xmm15,%xmm9,%xmm9 + xorq $0x80,%rbp + movl $1,%ecx + jmp .Loop_dec8x +/* Main loop: each of the eight interleaved sections applies one vaesdec round to all eight streams and advances one stream. It rebuilds that stream's output pointer from the stored offset, substitutes %rsp for the input pointer once %ecx (constant 1 inside the loop) is not below the stream's remaining count and for the output pointer once %ecx exceeds it, then fetches the stream's next ciphertext block. */ +.align 32 +.Loop_dec8x: + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+0(%rsp),%ecx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r8) + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r8,%rbx,1),%rbx + cmovgeq %rsp,%r8 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r8,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r8),%xmm10 + movq %rbx,64+0(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -72(%rsi),%xmm1 + leaq 16(%r8,%rbx,1),%r8 + vmovdqu %xmm10,128(%rsp) + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+4(%rsp),%ecx + movq 64+8(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r9) + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r9,%rbx,1),%rbx + cmovgeq %rsp,%r9 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r9,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r9),%xmm11 + movq %rbx,64+8(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups -56(%rsi),%xmm0 + leaq 16(%r9,%rbx,1),%r9 + vmovdqu %xmm11,144(%rsp) + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+8(%rsp),%ecx + movq 64+16(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r10) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r8) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r10,%rbx,1),%rbx + cmovgeq %rsp,%r10 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq
%r10,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r10),%xmm12 + movq %rbx,64+16(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -40(%rsi),%xmm1 + leaq 16(%r10,%rbx,1),%r10 + vmovdqu %xmm12,160(%rsp) + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+12(%rsp),%ecx + movq 64+24(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r11) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r9) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r11,%rbx,1),%rbx + cmovgeq %rsp,%r11 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r11,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r11),%xmm13 + movq %rbx,64+24(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups -24(%rsi),%xmm0 + leaq 16(%r11,%rbx,1),%r11 + vmovdqu %xmm13,176(%rsp) + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+16(%rsp),%ecx + movq 64+32(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r12) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r10) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r12,%rbx,1),%rbx + cmovgeq %rsp,%r12 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r12,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r12),%xmm10 + movq %rbx,64+32(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -8(%rsi),%xmm1 + leaq 16(%r12,%rbx,1),%r12 + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+20(%rsp),%ecx + movq 64+40(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r13) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r11) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%rbx,%r13,1),%rbx + cmovgeq %rsp,%r13 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r13,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r13),%xmm11 + movq %rbx,64+40(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 8(%rsi),%xmm0 + leaq 16(%r13,%rbx,1),%r13 + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+24(%rsp),%ecx + movq 64+48(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r14) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r12) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r14,%rbx,1),%rbx + cmovgeq %rsp,%r14 + vaesdec
%xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r14,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r14),%xmm12 + movq %rbx,64+48(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 24(%rsi),%xmm1 + leaq 16(%r14,%rbx,1),%r14 + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+28(%rsp),%ecx + movq 64+56(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r15) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r13) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r15,%rbx,1),%rbx + cmovgeq %rsp,%r15 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r15,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r15),%xmm13 + movq %rbx,64+56(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 40(%rsi),%xmm0 + leaq 16(%r15,%rbx,1),%r15 + vmovdqu 32(%rsp),%xmm14 + prefetcht0 15(%r14) + prefetcht0 15(%r15) + cmpl $11,%eax + jb .Ldec8x_tail +/* Two extra rounds, skipped when %eax is below 11 (presumably %eax is 9, 11 or 13 for 128-, 192- or 256-bit keys; confirm against the key setup code). */ + vaesdec %xmm1,%xmm2,%xmm2 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vaesdec %xmm1,%xmm7,%xmm7 + vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 176-120(%rsi),%xmm1 + + vaesdec %xmm0,%xmm2,%xmm2 + vaesdec %xmm0,%xmm3,%xmm3 + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + vaesdec %xmm0,%xmm6,%xmm6 + vaesdec %xmm0,%xmm7,%xmm7 + vaesdec %xmm0,%xmm8,%xmm8 + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 192-120(%rsi),%xmm0 + je .Ldec8x_tail +/* Two more rounds, skipped when %eax equals 11; the je above still tests the flags of the cmpl against 11, which the intervening vector instructions leave untouched. */ + vaesdec %xmm1,%xmm2,%xmm2 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vaesdec %xmm1,%xmm7,%xmm7 + vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 208-120(%rsi),%xmm1 + + vaesdec %xmm0,%xmm2,%xmm2 + vaesdec %xmm0,%xmm3,%xmm3 + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + vaesdec %xmm0,%xmm6,%xmm6 + vaesdec %xmm0,%xmm7,%xmm7 + vaesdec %xmm0,%xmm8,%xmm8 + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 224-120(%rsi),%xmm0 +/* Tail: the last vaesdec round followed by vaesdeclast. The interleaved integer-vector code subtracts one from every positive counter at 32..63(%rsp), and %xmm1, %xmm0 and %xmm15 are reloaded with round keys 1, 2 and 0 for the next pass. */ +.Ldec8x_tail: + vaesdec %xmm1,%xmm2,%xmm2 + vpxor %xmm15,%xmm15,%xmm15 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec
%xmm1,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vpaddd %xmm14,%xmm15,%xmm15 + vmovdqu 48(%rsp),%xmm14 + vaesdec %xmm1,%xmm7,%xmm7 + movq 64(%rsp),%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 16-120(%rsi),%xmm1 +/* Each vaesdeclast result is xored with the chaining value in the buffer at %rbp: the IV on the first pass, the previous ciphertext block afterwards. */ + vaesdeclast %xmm0,%xmm2,%xmm2 + vmovdqa %xmm15,32(%rsp) + vpxor %xmm15,%xmm15,%xmm15 + vaesdeclast %xmm0,%xmm3,%xmm3 + vpxor 0(%rbp),%xmm2,%xmm2 + vaesdeclast %xmm0,%xmm4,%xmm4 + vpxor 16(%rbp),%xmm3,%xmm3 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesdeclast %xmm0,%xmm5,%xmm5 + vpxor 32(%rbp),%xmm4,%xmm4 + vaesdeclast %xmm0,%xmm6,%xmm6 + vpxor 48(%rbp),%xmm5,%xmm5 + vpaddd %xmm15,%xmm14,%xmm14 + vmovdqu -120(%rsi),%xmm15 + vaesdeclast %xmm0,%xmm7,%xmm7 + vpxor 64(%rbp),%xmm6,%xmm6 + vaesdeclast %xmm0,%xmm8,%xmm8 + vpxor 80(%rbp),%xmm7,%xmm7 + vmovdqa %xmm14,48(%rsp) + vaesdeclast %xmm0,%xmm9,%xmm9 + vpxor 96(%rbp),%xmm8,%xmm8 + vmovups 32-120(%rsi),%xmm0 +/* Write the plaintext blocks, turn each advanced output pointer back into the advanced input pointer by subtracting the stored offset, save the ciphertext fetched during this pass into the buffer at %rbp, and apply round key 0 to it for the next pass. */ + vmovups %xmm2,-16(%r8) + subq %rbx,%r8 + vmovdqu 128+0(%rsp),%xmm2 + vpxor 112(%rbp),%xmm9,%xmm9 + vmovups %xmm3,-16(%r9) + subq 72(%rsp),%r9 + vmovdqu %xmm2,0(%rbp) + vpxor %xmm15,%xmm2,%xmm2 + vmovdqu 128+16(%rsp),%xmm3 + vmovups %xmm4,-16(%r10) + subq 80(%rsp),%r10 + vmovdqu %xmm3,16(%rbp) + vpxor %xmm15,%xmm3,%xmm3 + vmovdqu 128+32(%rsp),%xmm4 + vmovups %xmm5,-16(%r11) + subq 88(%rsp),%r11 + vmovdqu %xmm4,32(%rbp) + vpxor %xmm15,%xmm4,%xmm4 + vmovdqu 128+48(%rsp),%xmm5 + vmovups %xmm6,-16(%r12) + subq 96(%rsp),%r12 + vmovdqu %xmm5,48(%rbp) + vpxor %xmm15,%xmm5,%xmm5 + vmovdqu %xmm10,64(%rbp) + vpxor %xmm10,%xmm15,%xmm6 + vmovups %xmm7,-16(%r13) + subq 104(%rsp),%r13 + vmovdqu %xmm11,80(%rbp) + vpxor %xmm11,%xmm15,%xmm7 + vmovups %xmm8,-16(%r14) + subq 112(%rsp),%r14 + vmovdqu %xmm12,96(%rbp) + vpxor %xmm12,%xmm15,%xmm8 + vmovups %xmm9,-16(%r15) + subq 120(%rsp),%r15 + vmovdqu %xmm13,112(%rbp) + vpxor %xmm13,%xmm15,%xmm9 +/* Swap the two chaining buffers; %edx counts the remaining passes. */ + xorq $128,%rbp + decl %edx + jnz .Loop_dec8x +/* Recover the entry %rsp that the prologue stored at 16(%rsp). */ + movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 +/* Epilogue, also reached straight from the setup code when no stream has work (with %rax still holding the entry %rsp): vzeroupper, reload the six callee-saved registers from just below the entry %rsp, restore %rsp. The .byte pair encodes rep ret. */ + + +
+ +.Ldec8x_done: + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Ldec8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S index 294db310a06..38f306142c8 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S @@ -13,6 +13,11 @@ aesni_cbc_sha1_enc: movq OPENSSL_ia32cap_P+4(%rip),%r11 btq $61,%r11 jc aesni_cbc_sha1_enc_shaext + andl $268435456,%r11d + andl $1073741824,%r10d + orl %r11d,%r10d + cmpl $1342177280,%r10d + je aesni_cbc_sha1_enc_avx jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 .cfi_endproc @@ -1394,6 +1399,1327 @@ aesni_cbc_sha1_enc_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 +.type aesni_cbc_sha1_enc_avx,@function +.align 32 +aesni_cbc_sha1_enc_avx: +.cfi_startproc + movq 8(%rsp),%r10 + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -104(%rsp),%rsp +.cfi_adjust_cfa_offset 104 + + + vzeroall + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + leaq 112(%rcx),%r15 + vmovdqu (%r8),%xmm12 + movq %r8,88(%rsp) + shlq $6,%r14 + subq %r12,%r13 + movl 240-112(%r15),%r8d + addq %r10,%r14 + + leaq K_XX_XX(%rip),%r11 + movl 0(%r9),%eax + movl 4(%r9),%ebx + movl 8(%r9),%ecx + movl 12(%r9),%edx + movl %ebx,%esi + movl
16(%r9),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa 0(%r11),%xmm10 + vmovdqu 0(%r10),%xmm0 + vmovdqu 16(%r10),%xmm1 + vmovdqu 32(%r10),%xmm2 + vmovdqu 48(%r10),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r10 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm10,%xmm0,%xmm4 + vpaddd %xmm10,%xmm1,%xmm5 + vpaddd %xmm10,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + jmp .Loop_avx +.align 32 +.Loop_avx: + shrdl $2,%ebx,%ebx + vmovdqu 0(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm10,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm9 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpor %xmm8,%xmm4,%xmm4 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + vpxor %xmm9,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl 
%ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm10,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm9 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpor %xmm8,%xmm5,%xmm5 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm9,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa 16(%r11),%xmm10 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + vpaddd %xmm5,%xmm10,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm9 + vpaddd 
%xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpor %xmm8,%xmm6,%xmm6 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm9,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm10,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm9 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpor %xmm8,%xmm7,%xmm7 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + cmpl $11,%r8d + jb .Lvaesenclast6 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast6 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast6: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm9,%xmm7,%xmm7 + xorl 
%ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm10,%xmm9 + addl %esi,%edx + vmovdqu 16(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vmovups %xmm12,0(%r12,%r13,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm10,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %eax,%edi + shrdl 
$7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm10,%xmm9 + vmovdqa 32(%r11),%xmm10 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm10,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl 
%edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + vpaddd %xmm3,%xmm10,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm10,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast7 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast7 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast7: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx 
+ xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + vmovdqu 32(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vmovups %xmm12,16(%r13,%r12,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm10,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm10,%xmm9 + vmovdqa 48(%r11),%xmm10 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 
-32(%r15),%xmm14 + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm10,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor 
%xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm10,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + cmpl $11,%r8d + jb .Lvaesenclast8 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast8 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast8: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm10,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vmovdqu 48(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vmovups %xmm12,32(%r13,%r12,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld 
$30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm10,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm10,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl 
%edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r14,%r10 + je .Ldone_avx + vmovdqa 64(%r11),%xmm9 + vmovdqa 0(%r11),%xmm10 + vmovdqu 0(%r10),%xmm0 + vmovdqu 16(%r10),%xmm1 + vmovdqu 32(%r10),%xmm2 + vmovdqu 48(%r10),%xmm3 + vpshufb %xmm9,%xmm0,%xmm0 + addq $64,%r10 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm9,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm10,%xmm0,%xmm8 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm8,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm9,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm10,%xmm1,%xmm8 + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm8,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + vaesenc 
%xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm9,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm10,%xmm2,%xmm8 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm8,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast9 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast9 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast9: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vmovups %xmm12,48(%r13,%r12,1) + leaq 64(%r12),%r12 + + addl 0(%r9),%eax + addl 4(%r9),%esi + addl 8(%r9),%ecx + addl 12(%r9),%edx + movl %eax,0(%r9) + addl 16(%r9),%ebp + movl %esi,4(%r9) + movl %esi,%ebx + movl %ecx,8(%r9) + movl %ecx,%edi + movl %edx,12(%r9) + xorl %edx,%edi + movl %ebp,16(%r9) + andl %edi,%esi + jmp .Loop_avx + +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl 
$5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast10 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast10 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast10: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vmovups %xmm12,48(%r13,%r12,1) + movq 88(%rsp),%r8 + + addl 0(%r9),%eax + addl 
4(%r9),%esi + addl 8(%r9),%ecx + movl %eax,0(%r9) + addl 12(%r9),%edx + movl %esi,4(%r9) + addl 16(%r9),%ebp + movl %ecx,8(%r9) + movl %edx,12(%r9) + movl %ebp,16(%r9) + vmovups %xmm12,(%r8) + vzeroall + leaq 104(%rsp),%rsi +.cfi_def_cfa %rsi,56 + movq 0(%rsi),%r15 +.cfi_restore %r15 + movq 8(%rsi),%r14 +.cfi_restore %r14 + movq 16(%rsi),%r13 +.cfi_restore %r13 + movq 24(%rsi),%r12 +.cfi_restore %r12 + movq 32(%rsi),%rbp +.cfi_restore %rbp + movq 40(%rsi),%rbx +.cfi_restore %rbx + leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,8 +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -1485,17 +2811,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb .Laesenclast6 + jb .Laesenclast11 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast6 + je .Laesenclast11 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast6: +.Laesenclast11: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -1551,17 +2877,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb .Laesenclast7 + jb .Laesenclast12 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast7 + je .Laesenclast12 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast7: +.Laesenclast12: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 @@ -1617,17 +2943,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb .Laesenclast8 + jb .Laesenclast13 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast8 + je .Laesenclast13 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 
102,15,56,220,208 -.Laesenclast8: +.Laesenclast13: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -1681,17 +3007,17 @@ aesni_cbc_sha1_enc_shaext: movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb .Laesenclast9 + jb .Laesenclast14 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast9 + je .Laesenclast14 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast9: +.Laesenclast14: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S index e42a02ebe64..cb9e150db55 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S @@ -8,6 +8,25 @@ .align 16 aesni_cbc_sha256_enc: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl $1,%eax + cmpq $0,%rdi + je .Lprobe + movl 0(%r11),%eax + movq 4(%r11),%r10 + btq $61,%r10 + jc aesni_cbc_sha256_enc_shaext + movq %r10,%r11 + shrq $32,%r11 + + testl $2048,%r10d + jnz aesni_cbc_sha256_enc_xop + andl $296,%r11d + cmpl $296,%r11d + je aesni_cbc_sha256_enc_avx2 + andl $268435456,%r10d + jnz aesni_cbc_sha256_enc_avx + ud2 xorl %eax,%eax cmpq $0,%rdi je .Lprobe @@ -59,3 +78,4360 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.type aesni_cbc_sha256_enc_xop,@function +.align 64 +aesni_cbc_sha256_enc_xop: +.cfi_startproc +.Lxop_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_offset %r15,-56 + subq $128,%rsp + andq $-64,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_xop: + vzeroall + + movq %rdi,%r12 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r13 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + subq $9,%r14 + + movl 0(%r15),%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + + vmovdqa 0(%r13,%r14,8),%xmm14 + vmovdqa 16(%r13,%r14,8),%xmm13 + vmovdqa 32(%r13,%r14,8),%xmm12 + vmovdqu 0-128(%rdi),%xmm10 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi,%r12,1),%xmm0 + vmovdqu 16(%rsi,%r12,1),%xmm1 + vmovdqu 32(%rsi,%r12,1),%xmm2 + vmovdqu 48(%rsi,%r12,1),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%esi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%esi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + subq $-32*4,%rbp + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorl $14,%r13d + movl %r14d,%eax + vpalignr $4,%xmm2,%xmm3,%xmm7 + movl %r9d,%r12d + xorl %r8d,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %r10d,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %eax,%r14d + vpaddd %xmm7,%xmm0,%xmm0 + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d +.byte 143,232,120,194,245,11 + rorl 
$11,%r14d + xorl %r10d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi +.byte 143,232,120,194,251,13 + xorl %eax,%r14d + addl %r13d,%r11d + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%esi + addl %r11d,%edx + vpsrld $10,%xmm3,%xmm6 + rorl $2,%r14d + addl %esi,%r11d + vpaddd %xmm4,%xmm0,%xmm0 + movl %edx,%r13d + addl %r11d,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%r11d + vpxor %xmm6,%xmm7,%xmm7 + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 4(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d +.byte 143,232,120,194,248,13 + xorl %r11d,%r14d + addl %r13d,%r10d + vpsrld $10,%xmm0,%xmm6 + xorl %eax,%r15d + addl %r10d,%ecx +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%r10d + vpxor %xmm6,%xmm7,%xmm7 + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + vpxor %xmm5,%xmm7,%xmm7 + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + vpaddd %xmm7,%xmm0,%xmm0 + addl 8(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + 
movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorl $14,%r13d + movl %r14d,%r8d + vpalignr $4,%xmm3,%xmm0,%xmm7 + movl %ebx,%r12d + xorl %eax,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %ecx,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %r8d,%r14d + vpaddd %xmm7,%xmm1,%xmm1 + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %ecx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi +.byte 143,232,120,194,248,13 + xorl %r8d,%r14d + addl %r13d,%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %r9d,%esi + addl %edx,%r11d + vpsrld $10,%xmm0,%xmm6 + rorl $2,%r14d + addl %esi,%edx + vpaddd %xmm4,%xmm1,%xmm1 + movl %r11d,%r13d + addl %edx,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%edx + vpxor %xmm6,%xmm7,%xmm7 + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 20(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d +.byte 143,232,120,194,249,13 + xorl %edx,%r14d + addl %r13d,%ecx + vpsrld $10,%xmm1,%xmm6 + xorl %r8d,%r15d + addl %ecx,%r10d +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%ecx + vpxor %xmm6,%xmm7,%xmm7 + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + vpxor %xmm5,%xmm7,%xmm7 + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + 
xorl %eax,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + vpaddd %xmm7,%xmm1,%xmm1 + addl 24(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorl $14,%r13d + movl %r14d,%eax + vpalignr $4,%xmm0,%xmm1,%xmm7 + movl %r9d,%r12d + xorl %r8d,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %r10d,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %eax,%r14d + vpaddd %xmm7,%xmm2,%xmm2 + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %r10d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi +.byte 143,232,120,194,249,13 + xorl %eax,%r14d + addl %r13d,%r11d + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%esi + addl %r11d,%edx + vpsrld $10,%xmm1,%xmm6 + rorl $2,%r14d + addl %esi,%r11d + vpaddd %xmm4,%xmm2,%xmm2 + movl %edx,%r13d + addl %r11d,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%r11d + vpxor %xmm6,%xmm7,%xmm7 + movl 
%r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 36(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d +.byte 143,232,120,194,250,13 + xorl %r11d,%r14d + addl %r13d,%r10d + vpsrld $10,%xmm2,%xmm6 + xorl %eax,%r15d + addl %r10d,%ecx +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%r10d + vpxor %xmm6,%xmm7,%xmm7 + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + vpxor %xmm5,%xmm7,%xmm7 + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + vpaddd %xmm7,%xmm2,%xmm2 + addl 40(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorl $14,%r13d + movl %r14d,%r8d + vpalignr $4,%xmm1,%xmm2,%xmm7 + movl 
%ebx,%r12d + xorl %eax,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %ecx,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %r8d,%r14d + vpaddd %xmm7,%xmm3,%xmm3 + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %ecx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi +.byte 143,232,120,194,250,13 + xorl %r8d,%r14d + addl %r13d,%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %r9d,%esi + addl %edx,%r11d + vpsrld $10,%xmm2,%xmm6 + rorl $2,%r14d + addl %esi,%edx + vpaddd %xmm4,%xmm3,%xmm3 + movl %r11d,%r13d + addl %edx,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%edx + vpxor %xmm6,%xmm7,%xmm7 + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 52(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d +.byte 143,232,120,194,251,13 + xorl %edx,%r14d + addl %r13d,%ecx + vpsrld $10,%xmm3,%xmm6 + xorl %r8d,%r15d + addl %ecx,%r10d +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%ecx + vpxor %xmm6,%xmm7,%xmm7 + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + vpxor %xmm5,%xmm7,%xmm7 + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + vpaddd %xmm7,%xmm3,%xmm3 + addl 56(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + vpaddd 
96(%rbp),%xmm3,%xmm6 + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + movq 64+0(%rsp),%r12 + vpand %xmm14,%xmm11,%xmm11 + movq 64+8(%rsp),%r15 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r12,1) + leaq 16(%r12),%r12 + cmpb $0,131(%rbp) + jne .Lxop_00_47 + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + rorl $2,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + rorl $2,%r14d 
+ addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + rorl $2,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d + 
xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + rorl $2,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + rorl $2,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%esi + rorl 
$11,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + rorl $2,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + rorl $2,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + rorl $5,%r13d + xorl 
%edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + rorl $2,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%r12 + movq 64+8(%rsp),%r13 + movq 64+40(%rsp),%r15 + movq 64+48(%rsp),%rsi + + vpand %xmm14,%xmm11,%xmm11 + movl %r14d,%eax + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r12),%r12 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r12 + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl 
%ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + jb .Lloop_xop + + movq 64+32(%rsp),%r8 + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop +.type aesni_cbc_sha256_enc_avx,@function +.align 64 +aesni_cbc_sha256_enc_avx: +.cfi_startproc +.Lavx_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $128,%rsp + andq $-64,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + vzeroall + + movq %rdi,%r12 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r13 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + subq $9,%r14 + + movl 0(%r15),%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + + vmovdqa 0(%r13,%r14,8),%xmm14 + vmovdqa 16(%r13,%r14,8),%xmm13 + vmovdqa 32(%r13,%r14,8),%xmm12 + vmovdqu 0-128(%rdi),%xmm10 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi,%r12,1),%xmm0 + vmovdqu 16(%rsi,%r12,1),%xmm1 + vmovdqu 32(%rsi,%r12,1),%xmm2 + vmovdqu 
48(%rsi,%r12,1),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%esi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%esi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-32*4,%rbp + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + vpshufd $250,%xmm3,%xmm7 + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + 
vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpaddd %xmm6,%xmm0,%xmm0 + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + vpshufd $80,%xmm0,%xmm7 + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + vpsrlq $17,%xmm7,%xmm7 + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + vpaddd 0(%rbp),%xmm0,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl 
%eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + vpshufd $250,%xmm0,%xmm7 + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpaddd %xmm6,%xmm1,%xmm1 + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + vpshufd $80,%xmm1,%xmm7 + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + vpsrlq $17,%xmm7,%xmm7 + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%ebx + movl %r9d,%r13d + addl 
%ebx,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + vpaddd 32(%rbp),%xmm1,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + vpshufd $250,%xmm1,%xmm7 + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + vpxor 
%xmm5,%xmm4,%xmm4 + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpaddd %xmm6,%xmm2,%xmm2 + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + vpshufd $80,%xmm2,%xmm7 + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + vpsrlq $17,%xmm7,%xmm7 + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + vpaddd 64(%rbp),%xmm2,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + 
movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + vpshufd $250,%xmm2,%xmm7 + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpaddd %xmm6,%xmm3,%xmm3 
+ movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + vpshufd $80,%xmm3,%xmm7 + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + vpsrlq $17,%xmm7,%xmm7 + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpslldq $8,%xmm6,%xmm6 + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + vpaddd 96(%rbp),%xmm3,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + movq 64+0(%rsp),%r12 + vpand %xmm14,%xmm11,%xmm11 + movq 64+8(%rsp),%r15 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r12,1) + leaq 16(%r12),%r12 + cmpb $0,131(%rbp) + jne .Lavx_00_47 + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl 
%r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + shrdl 
$6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl 
%r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + shrdl 
$6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl 
$14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%r12 + movq 64+8(%rsp),%r13 + movq 64+40(%rsp),%r15 + movq 64+48(%rsp),%rsi + + vpand %xmm14,%xmm11,%xmm11 + movl %r14d,%eax + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r12),%r12 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r12 + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + jb .Lloop_avx + + movq 64+32(%rsp),%r8 + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx +.type aesni_cbc_sha256_enc_avx2,@function +.align 64 +aesni_cbc_sha256_enc_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset 
%r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-1024,%rsp + addq $448,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + vzeroall + + movq %rdi,%r13 + vpinsrq $1,%rsi,%xmm15,%xmm15 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r12 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + leaq -9(%r14),%r14 + + vmovdqa 0(%r12,%r14,8),%xmm14 + vmovdqa 16(%r12,%r14,8),%xmm13 + vmovdqa 32(%r12,%r14,8),%xmm12 + + subq $-64,%r13 + movl 0(%r15),%eax + leaq (%rsi,%r13,1),%r12 + movl 4(%r15),%ebx + cmpq %rdx,%r13 + movl 8(%r15),%ecx + cmoveq %rsp,%r12 + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + vmovdqu 0-128(%rdi),%xmm10 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi,%r13,1),%xmm0 + vmovdqu -64+16(%rsi,%r13,1),%xmm1 + vmovdqu -64+32(%rsi,%r13,1),%xmm2 + vmovdqu -64+48(%rsi,%r13,1),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + leaq -64(%r13),%r13 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + leaq -64(%rsp),%rsp + + + + movq %rsi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%esi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%esi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 
+.Lavx2_00_47: + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal 
(%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpshufd $80,%ymm0,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpsrlq $2,%ymm7,%ymm7 + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + vpaddd %ymm6,%ymm0,%ymm0 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + vpaddd 0(%rbp),%ymm0,%ymm6 + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + 
leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpshufd $80,%ymm1,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%r9d,%esi + leal 
(%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpsrlq $2,%ymm7,%ymm7 + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + vpaddd %ymm6,%ymm1,%ymm1 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + vpaddd 32(%rbp),%ymm1,%ymm6 + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal 
(%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpshufd $80,%ymm2,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpsrlq $2,%ymm7,%ymm7 + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + vpaddd %ymm6,%ymm2,%ymm2 + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d 
+ vpaddd 64(%rbp),%ymm2,%ymm6 + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl 
%eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpshufd $80,%ymm3,%ymm7 + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpsrlq $2,%ymm7,%ymm7 + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + vpaddd %ymm6,%ymm3,%ymm3 + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + vpaddd 96(%rbp),%ymm3,%ymm6 + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + vmovq %xmm15,%r13 + vpextrq $1,%xmm15,%r15 + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r13,1) + leaq 16(%r13),%r13 + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d 
+ leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl 
%ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + 
xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 
160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + 
rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vpextrq $1,%xmm15,%r12 + vmovq %xmm15,%r13 + movq 552(%rsp),%r15 + addl %r14d,%eax + leaq 448(%rsp),%rbp + + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r13),%r13 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + cmpq 80(%rbp),%r13 + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%esi + movl %r9d,%r12d + xorl %ecx,%esi + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl 
$25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl 
%r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + 
movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d 
+ leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal 
(%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovq %xmm15,%r13 + vpextrq $1,%xmm15,%r15 + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + leaq -64(%rbp),%rbp + vmovdqu %xmm8,(%r15,%r13,1) + leaq 16(%r13),%r13 + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 552(%rsp),%r15 + leaq 64(%r13),%r13 + movq 560(%rsp),%rsi + addl %r14d,%eax + leaq 448(%rsp),%rsp + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + leaq (%rsi,%r13,1),%r12 + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r13 + + 
movl %eax,0(%r15) + cmoveq %rsp,%r12 + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 64+32(%rbp),%r8 + movq 64+56(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 +.type aesni_cbc_sha256_enc_shaext,@function +.align 32 +aesni_cbc_sha256_enc_shaext: +.cfi_startproc + movq 8(%rsp),%r10 + leaq K256+128(%rip),%rax + movdqu (%r9),%xmm1 + movdqu 16(%r9),%xmm2 + movdqa 512-128(%rax),%xmm3 + + movl 240(%rcx),%r11d + subq %rdi,%rsi + movups (%rcx),%xmm15 + movups (%r8),%xmm6 + movups 16(%rcx),%xmm4 + leaq 112(%rcx),%rcx + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%r10),%xmm10 + movdqu 16(%r10),%xmm11 + movdqu 32(%r10),%xmm12 +.byte 102,68,15,56,0,211 + movdqu 48(%r10),%xmm13 + + movdqa 0-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 102,68,15,56,0,219 + movdqa %xmm2,%xmm9 + movdqa %xmm1,%xmm8 + movups 0(%rdi),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 32-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 102,68,15,56,0,227 + leaq 64(%r10),%r10 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + 
pshufd $0x0e,%xmm0,%xmm0 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 64-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 102,68,15,56,0,235 +.byte 69,15,56,204,211 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 96-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 69,15,56,205,213 +.byte 69,15,56,204,220 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 +.byte 15,56,203,202 + movdqa 128-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + cmpl $11,%r11d + jb .Laesenclast1 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast1 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast1: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,202 + movups 16(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,0(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movdqa 160-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 192-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 +.byte 69,15,56,204,211 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd 
$0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 224-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 69,15,56,205,213 +.byte 69,15,56,204,220 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 256-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + cmpl $11,%r11d + jb .Laesenclast2 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast2 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast2: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,202 + movups 32(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,16(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movdqa 288-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 320-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 +.byte 69,15,56,204,211 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 352-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 
69,15,56,205,213 +.byte 69,15,56,204,220 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 384-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 416-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + cmpl $11,%r11d + jb .Laesenclast3 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast3 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast3: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups 48(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,32(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 448-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 + movdqa %xmm7,%xmm3 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 480-128(%rax),%xmm0 + paddd %xmm13,%xmm0 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + cmpl $11,%r11d + jb .Laesenclast4 + movups 
64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast4 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast4: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop + + paddd %xmm9,%xmm2 + paddd %xmm8,%xmm1 + + decq %rdx + movups %xmm6,48(%rsi,%rdi,1) + leaq 64(%rdi),%rdi + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm3 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,211,8 + + movups %xmm6,(%r8) + movdqu %xmm1,(%r9) + movdqu %xmm2,16(%r9) + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext diff --git a/secure/lib/libcrypto/amd64/chacha-x86_64.S b/secure/lib/libcrypto/amd64/chacha-x86_64.S index 0b3d5b8b6db..b01c1b87d47 100644 --- a/secure/lib/libcrypto/amd64/chacha-x86_64.S +++ b/secure/lib/libcrypto/amd64/chacha-x86_64.S @@ -331,6 +331,8 @@ ChaCha20_ssse3: .LChaCha20_ssse3: movq %rsp,%r9 .cfi_def_cfa_register %r9 + testl $2048,%r10d + jnz .LChaCha20_4xop cmpq $128,%rdx je .LChaCha20_128 ja .LChaCha20_4x @@ -626,6 +628,9 @@ ChaCha20_4x: movq %rsp,%r9 .cfi_def_cfa_register %r9 movq %r10,%r11 + shrq $32,%r10 + testq $32,%r10 + jnz .LChaCha20_8x cmpq $192,%rdx ja .Lproceed4x @@ -1167,3 +1172,1024 @@ ChaCha20_4x: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x +.type ChaCha20_4xop,@function +.align 32 +ChaCha20_4xop: +.cfi_startproc +.LChaCha20_4xop: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $0x140+8,%rsp + vzeroupper + + vmovdqa .Lsigma(%rip),%xmm11 + vmovdqu (%rcx),%xmm3 + vmovdqu 16(%rcx),%xmm15 + vmovdqu (%r8),%xmm7 + leaq 256(%rsp),%rcx + + vpshufd $0x00,%xmm11,%xmm8 + vpshufd $0x55,%xmm11,%xmm9 + vmovdqa %xmm8,64(%rsp) + vpshufd $0xaa,%xmm11,%xmm10 + vmovdqa %xmm9,80(%rsp) + vpshufd $0xff,%xmm11,%xmm11 + vmovdqa %xmm10,96(%rsp) + vmovdqa %xmm11,112(%rsp) + + vpshufd $0x00,%xmm3,%xmm0 + vpshufd $0x55,%xmm3,%xmm1 + vmovdqa %xmm0,128-256(%rcx) + vpshufd 
$0xaa,%xmm3,%xmm2 + vmovdqa %xmm1,144-256(%rcx) + vpshufd $0xff,%xmm3,%xmm3 + vmovdqa %xmm2,160-256(%rcx) + vmovdqa %xmm3,176-256(%rcx) + + vpshufd $0x00,%xmm15,%xmm12 + vpshufd $0x55,%xmm15,%xmm13 + vmovdqa %xmm12,192-256(%rcx) + vpshufd $0xaa,%xmm15,%xmm14 + vmovdqa %xmm13,208-256(%rcx) + vpshufd $0xff,%xmm15,%xmm15 + vmovdqa %xmm14,224-256(%rcx) + vmovdqa %xmm15,240-256(%rcx) + + vpshufd $0x00,%xmm7,%xmm4 + vpshufd $0x55,%xmm7,%xmm5 + vpaddd .Linc(%rip),%xmm4,%xmm4 + vpshufd $0xaa,%xmm7,%xmm6 + vmovdqa %xmm5,272-256(%rcx) + vpshufd $0xff,%xmm7,%xmm7 + vmovdqa %xmm6,288-256(%rcx) + vmovdqa %xmm7,304-256(%rcx) + + jmp .Loop_enter4xop + +.align 32 +.Loop_outer4xop: + vmovdqa 64(%rsp),%xmm8 + vmovdqa 80(%rsp),%xmm9 + vmovdqa 96(%rsp),%xmm10 + vmovdqa 112(%rsp),%xmm11 + vmovdqa 128-256(%rcx),%xmm0 + vmovdqa 144-256(%rcx),%xmm1 + vmovdqa 160-256(%rcx),%xmm2 + vmovdqa 176-256(%rcx),%xmm3 + vmovdqa 192-256(%rcx),%xmm12 + vmovdqa 208-256(%rcx),%xmm13 + vmovdqa 224-256(%rcx),%xmm14 + vmovdqa 240-256(%rcx),%xmm15 + vmovdqa 256-256(%rcx),%xmm4 + vmovdqa 272-256(%rcx),%xmm5 + vmovdqa 288-256(%rcx),%xmm6 + vmovdqa 304-256(%rcx),%xmm7 + vpaddd .Lfour(%rip),%xmm4,%xmm4 + +.Loop_enter4xop: + movl $10,%eax + vmovdqa %xmm4,256-256(%rcx) + jmp .Loop4xop + +.align 32 +.Loop4xop: + vpaddd %xmm0,%xmm8,%xmm8 + vpaddd %xmm1,%xmm9,%xmm9 + vpaddd %xmm2,%xmm10,%xmm10 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor %xmm4,%xmm8,%xmm4 + vpxor %xmm5,%xmm9,%xmm5 + vpxor %xmm6,%xmm10,%xmm6 + vpxor %xmm7,%xmm11,%xmm7 +.byte 143,232,120,194,228,16 +.byte 143,232,120,194,237,16 +.byte 143,232,120,194,246,16 +.byte 143,232,120,194,255,16 + vpaddd %xmm4,%xmm12,%xmm12 + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm15,%xmm15 + vpxor %xmm0,%xmm12,%xmm0 + vpxor %xmm1,%xmm13,%xmm1 + vpxor %xmm14,%xmm2,%xmm2 + vpxor %xmm15,%xmm3,%xmm3 +.byte 143,232,120,194,192,12 +.byte 143,232,120,194,201,12 +.byte 143,232,120,194,210,12 +.byte 143,232,120,194,219,12 + vpaddd %xmm8,%xmm0,%xmm8 + vpaddd 
%xmm9,%xmm1,%xmm9 + vpaddd %xmm2,%xmm10,%xmm10 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor %xmm4,%xmm8,%xmm4 + vpxor %xmm5,%xmm9,%xmm5 + vpxor %xmm6,%xmm10,%xmm6 + vpxor %xmm7,%xmm11,%xmm7 +.byte 143,232,120,194,228,8 +.byte 143,232,120,194,237,8 +.byte 143,232,120,194,246,8 +.byte 143,232,120,194,255,8 + vpaddd %xmm4,%xmm12,%xmm12 + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm15,%xmm15 + vpxor %xmm0,%xmm12,%xmm0 + vpxor %xmm1,%xmm13,%xmm1 + vpxor %xmm14,%xmm2,%xmm2 + vpxor %xmm15,%xmm3,%xmm3 +.byte 143,232,120,194,192,7 +.byte 143,232,120,194,201,7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,219,7 + vpaddd %xmm1,%xmm8,%xmm8 + vpaddd %xmm2,%xmm9,%xmm9 + vpaddd %xmm3,%xmm10,%xmm10 + vpaddd %xmm0,%xmm11,%xmm11 + vpxor %xmm7,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm4 + vpxor %xmm5,%xmm10,%xmm5 + vpxor %xmm6,%xmm11,%xmm6 +.byte 143,232,120,194,255,16 +.byte 143,232,120,194,228,16 +.byte 143,232,120,194,237,16 +.byte 143,232,120,194,246,16 + vpaddd %xmm7,%xmm14,%xmm14 + vpaddd %xmm4,%xmm15,%xmm15 + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm6,%xmm13,%xmm13 + vpxor %xmm1,%xmm14,%xmm1 + vpxor %xmm2,%xmm15,%xmm2 + vpxor %xmm12,%xmm3,%xmm3 + vpxor %xmm13,%xmm0,%xmm0 +.byte 143,232,120,194,201,12 +.byte 143,232,120,194,210,12 +.byte 143,232,120,194,219,12 +.byte 143,232,120,194,192,12 + vpaddd %xmm8,%xmm1,%xmm8 + vpaddd %xmm9,%xmm2,%xmm9 + vpaddd %xmm3,%xmm10,%xmm10 + vpaddd %xmm0,%xmm11,%xmm11 + vpxor %xmm7,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm4 + vpxor %xmm5,%xmm10,%xmm5 + vpxor %xmm6,%xmm11,%xmm6 +.byte 143,232,120,194,255,8 +.byte 143,232,120,194,228,8 +.byte 143,232,120,194,237,8 +.byte 143,232,120,194,246,8 + vpaddd %xmm7,%xmm14,%xmm14 + vpaddd %xmm4,%xmm15,%xmm15 + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm6,%xmm13,%xmm13 + vpxor %xmm1,%xmm14,%xmm1 + vpxor %xmm2,%xmm15,%xmm2 + vpxor %xmm12,%xmm3,%xmm3 + vpxor %xmm13,%xmm0,%xmm0 +.byte 143,232,120,194,201,7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,192,7 + 
decl %eax + jnz .Loop4xop + + vpaddd 64(%rsp),%xmm8,%xmm8 + vpaddd 80(%rsp),%xmm9,%xmm9 + vpaddd 96(%rsp),%xmm10,%xmm10 + vpaddd 112(%rsp),%xmm11,%xmm11 + + vmovdqa %xmm14,32(%rsp) + vmovdqa %xmm15,48(%rsp) + + vpunpckldq %xmm9,%xmm8,%xmm14 + vpunpckldq %xmm11,%xmm10,%xmm15 + vpunpckhdq %xmm9,%xmm8,%xmm8 + vpunpckhdq %xmm11,%xmm10,%xmm10 + vpunpcklqdq %xmm15,%xmm14,%xmm9 + vpunpckhqdq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm10,%xmm8,%xmm11 + vpunpckhqdq %xmm10,%xmm8,%xmm8 + vpaddd 128-256(%rcx),%xmm0,%xmm0 + vpaddd 144-256(%rcx),%xmm1,%xmm1 + vpaddd 160-256(%rcx),%xmm2,%xmm2 + vpaddd 176-256(%rcx),%xmm3,%xmm3 + + vmovdqa %xmm9,0(%rsp) + vmovdqa %xmm14,16(%rsp) + vmovdqa 32(%rsp),%xmm9 + vmovdqa 48(%rsp),%xmm14 + + vpunpckldq %xmm1,%xmm0,%xmm10 + vpunpckldq %xmm3,%xmm2,%xmm15 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm15,%xmm10,%xmm1 + vpunpckhqdq %xmm15,%xmm10,%xmm10 + vpunpcklqdq %xmm2,%xmm0,%xmm3 + vpunpckhqdq %xmm2,%xmm0,%xmm0 + vpaddd 192-256(%rcx),%xmm12,%xmm12 + vpaddd 208-256(%rcx),%xmm13,%xmm13 + vpaddd 224-256(%rcx),%xmm9,%xmm9 + vpaddd 240-256(%rcx),%xmm14,%xmm14 + + vpunpckldq %xmm13,%xmm12,%xmm2 + vpunpckldq %xmm14,%xmm9,%xmm15 + vpunpckhdq %xmm13,%xmm12,%xmm12 + vpunpckhdq %xmm14,%xmm9,%xmm9 + vpunpcklqdq %xmm15,%xmm2,%xmm13 + vpunpckhqdq %xmm15,%xmm2,%xmm2 + vpunpcklqdq %xmm9,%xmm12,%xmm14 + vpunpckhqdq %xmm9,%xmm12,%xmm12 + vpaddd 256-256(%rcx),%xmm4,%xmm4 + vpaddd 272-256(%rcx),%xmm5,%xmm5 + vpaddd 288-256(%rcx),%xmm6,%xmm6 + vpaddd 304-256(%rcx),%xmm7,%xmm7 + + vpunpckldq %xmm5,%xmm4,%xmm9 + vpunpckldq %xmm7,%xmm6,%xmm15 + vpunpckhdq %xmm5,%xmm4,%xmm4 + vpunpckhdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm15,%xmm9,%xmm5 + vpunpckhqdq %xmm15,%xmm9,%xmm9 + vpunpcklqdq %xmm6,%xmm4,%xmm7 + vpunpckhqdq %xmm6,%xmm4,%xmm4 + vmovdqa 0(%rsp),%xmm6 + vmovdqa 16(%rsp),%xmm15 + + cmpq $256,%rdx + jb .Ltail4xop + + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + 
vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + leaq 128(%rsi),%rsi + vpxor 0(%rsi),%xmm11,%xmm11 + vpxor 16(%rsi),%xmm3,%xmm3 + vpxor 32(%rsi),%xmm14,%xmm14 + vpxor 48(%rsi),%xmm7,%xmm7 + vpxor 64(%rsi),%xmm8,%xmm8 + vpxor 80(%rsi),%xmm0,%xmm0 + vpxor 96(%rsi),%xmm12,%xmm12 + vpxor 112(%rsi),%xmm4,%xmm4 + leaq 128(%rsi),%rsi + + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + leaq 128(%rdi),%rdi + vmovdqu %xmm11,0(%rdi) + vmovdqu %xmm3,16(%rdi) + vmovdqu %xmm14,32(%rdi) + vmovdqu %xmm7,48(%rdi) + vmovdqu %xmm8,64(%rdi) + vmovdqu %xmm0,80(%rdi) + vmovdqu %xmm12,96(%rdi) + vmovdqu %xmm4,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4xop + + jmp .Ldone4xop + +.align 32 +.Ltail4xop: + cmpq $192,%rdx + jae .L192_or_more4xop + cmpq $128,%rdx + jae .L128_or_more4xop + cmpq $64,%rdx + jae .L64_or_more4xop + + xorq %r10,%r10 + vmovdqa %xmm6,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm13,32(%rsp) + vmovdqa %xmm5,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L64_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + je .Ldone4xop + + leaq 64(%rsi),%rsi + vmovdqa %xmm15,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm10,16(%rsp) + leaq 64(%rdi),%rdi + vmovdqa %xmm2,32(%rsp) + subq $64,%rdx + vmovdqa %xmm9,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L128_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu 
%xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + je .Ldone4xop + + leaq 128(%rsi),%rsi + vmovdqa %xmm11,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm3,16(%rsp) + leaq 128(%rdi),%rdi + vmovdqa %xmm14,32(%rsp) + subq $128,%rdx + vmovdqa %xmm7,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L192_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + leaq 128(%rsi),%rsi + vpxor 0(%rsi),%xmm11,%xmm11 + vpxor 16(%rsi),%xmm3,%xmm3 + vpxor 32(%rsi),%xmm14,%xmm14 + vpxor 48(%rsi),%xmm7,%xmm7 + + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + leaq 128(%rdi),%rdi + vmovdqu %xmm11,0(%rdi) + vmovdqu %xmm3,16(%rdi) + vmovdqu %xmm14,32(%rdi) + vmovdqu %xmm7,48(%rdi) + je .Ldone4xop + + leaq 64(%rsi),%rsi + vmovdqa %xmm8,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm0,16(%rsp) + leaq 64(%rdi),%rdi + vmovdqa %xmm12,32(%rsp) + subq $192,%rdx + vmovdqa %xmm4,48(%rsp) + +.Loop_tail4xop: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4xop + +.Ldone4xop: + vzeroupper + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4xop_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_4xop,.-ChaCha20_4xop +.type ChaCha20_8x,@function +.align 32 +ChaCha20_8x: +.cfi_startproc +.LChaCha20_8x: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 
512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor 
%ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 
+ vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq 
%ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 
$0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x 
+ +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu 
%ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu 
%ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_8x,.-ChaCha20_8x diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S index c69b4d978f3..df18fa496de 100644 --- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S +++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S @@ -2790,6 +2790,10 @@ ecp_nistz256_neg: .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3118,6 +3122,10 @@ ecp_nistz256_ord_mul_mont: .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3405,6 +3413,462 @@ ecp_nistz256_ord_sqr_mont: .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 
+ movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 
24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + 
mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + @@ -3413,6 +3877,8 @@ ecp_nistz256_ord_sqr_mont: .align 32 ecp_nistz256_to_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx leaq .LRR(%rip),%rdx jmp .Lmul_mont .cfi_endproc @@ -3429,6 +3895,8 @@ ecp_nistz256_to_mont: .align 32 ecp_nistz256_mul_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 @@ -3449,6 +3917,8 @@ ecp_nistz256_mul_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: + cmpl $0x80100,%ecx + je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -3457,6 +3927,19 @@ 
ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx .Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -3707,6 +4190,8 @@ __ecp_nistz256_mul_montq: .align 32 ecp_nistz256_sqr_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3726,12 +4211,25 @@ ecp_nistz256_sqr_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: + cmpl $0x80100,%ecx + je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx .Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -3915,44 +4413,342 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + addq %rbp,%r9 + adcq %rcx,%r10 -.globl ecp_nistz256_from_mont -.type ecp_nistz256_from_mont,@function -.align 32 -ecp_nistz256_from_mont: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-24 -.Lfrom_body: + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 - movq 0(%rsi),%rax - movq 
.Lpoly+24(%rip),%r13 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %rax,%r8 - movq .Lpoly+8(%rip),%r12 + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 - movq %rax,%rcx - shlq $32,%r8 - mulq %r13 - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + 
sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 
+ adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx + + + + + + +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lfrom_body: + + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx @@ -4056,6 +4852,9 @@ ecp_nistz256_scatter_w5: .align 32 ecp_nistz256_gather_w5: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%eax + testl $32,%eax + jnz .Lavx2_gather_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -4139,6 +4938,9 @@ ecp_nistz256_scatter_w7: .align 32 ecp_nistz256_gather_w7: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%eax + testl $32,%eax + jnz .Lavx2_gather_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -4176,27 +4978,1272 @@ ecp_nistz256_gather_w7: movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_gather_w7: -.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 -.globl ecp_nistz256_avx2_gather_w7 -.type ecp_nistz256_avx2_gather_w7,@function -.align 32 -ecp_nistz256_avx2_gather_w7: -.cfi_startproc -.byte 0x0f,0x0b + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_gather_w7: +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 + + +.type 
ecp_nistz256_avx2_gather_w5,@function +.align 32 +ecp_nistz256_avx2_gather_w5: +.cfi_startproc +.Lavx2_gather_w5: + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w5: +.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 + + + +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,@function +.align 32 +ecp_nistz256_avx2_gather_w7: +.cfi_startproc +.Lavx2_gather_w7: + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd 
%ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w7: +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function +.align 32 +ecp_nistz256_point_double: +.cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_doublex + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: + +.Lpoint_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq 
.Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 
64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function +.align 32 +ecp_nistz256_point_add: +.cfi_startproc + movl 
$0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_addx + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx 
+ movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + + orq %r8,%r12 + orq %r9,%r12 + + +.byte 0x3e + jnz .Ladd_proceedq + +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 
24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 
16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore 
%r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function +.align 32 +ecp_nistz256_point_add_affine: +.cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_add_affinex + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq 
%r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + 
movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa 
%xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 -.type __ecp_nistz256_add_toq,@function +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.type __ecp_nistz256_add_tox,@function .align 32 -__ecp_nistz256_add_toq: +__ecp_nistz256_add_tox: .cfi_startproc xorq %r11,%r11 - addq 0(%rbx),%r12 + adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 @@ -4204,7 +6251,8 @@ __ecp_nistz256_add_toq: movq %r13,%rbp adcq $0,%r11 - subq $-1,%r12 + xorq %r10,%r10 + sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 @@ -4223,76 +6271,80 @@ __ecp_nistz256_add_toq: .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox -.type __ecp_nistz256_sub_fromq,@function +.type __ecp_nistz256_sub_fromx,@function .align 32 -__ecp_nistz256_sub_fromq: +__ecp_nistz256_sub_fromx: .cfi_startproc - subq 0(%rbx),%r12 + xorq %r11,%r11 + sbbq 0(%rbx),%r12 sbbq 
8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp - sbbq %r11,%r11 + sbbq $0,%r11 - addq $-1,%r12 + xorq %r10,%r10 + adcq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 - testq %r11,%r11 - cmovzq %rax,%r12 - cmovzq %rbp,%r13 + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 movq %r12,0(%rdi) - cmovzq %rcx,%r8 + cmovncq %rcx,%r8 movq %r13,8(%rdi) - cmovzq %r10,%r9 + cmovncq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx -.type __ecp_nistz256_subq,@function +.type __ecp_nistz256_subx,@function .align 32 -__ecp_nistz256_subq: +__ecp_nistz256_subx: .cfi_startproc - subq %r12,%rax + xorq %r11,%r11 + sbbq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 - sbbq %r11,%r11 + sbbq $0,%r11 - addq $-1,%rax + xorq %r9,%r9 + adcq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 - testq %r11,%r11 - cmovnzq %rax,%r12 - cmovnzq %rbp,%r13 - cmovnzq %rcx,%r8 - cmovnzq %r10,%r9 + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_subq,.-__ecp_nistz256_subq +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx -.type __ecp_nistz256_mul_by_2q,@function +.type __ecp_nistz256_mul_by_2x,@function .align 32 -__ecp_nistz256_mul_by_2q: +__ecp_nistz256_mul_by_2x: .cfi_startproc xorq %r11,%r11 - addq %r12,%r12 + adcq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 @@ -4300,7 +6352,8 @@ __ecp_nistz256_mul_by_2q: movq %r13,%rbp adcq $0,%r11 - subq $-1,%r12 + xorq %r10,%r10 + sbbq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 @@ -4319,12 +6372,12 @@ __ecp_nistz256_mul_by_2q: .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q -.globl ecp_nistz256_point_double -.type ecp_nistz256_point_double,@function 
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.type ecp_nistz256_point_doublex,@function .align 32 -ecp_nistz256_point_double: +ecp_nistz256_point_doublex: .cfi_startproc +.Lpoint_doublex: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4345,9 +6398,9 @@ ecp_nistz256_point_double: .cfi_offset %r15,-56 subq $160+8,%rsp .cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doubleq_body: +.Lpoint_doublex_body: -.Lpoint_double_shortcutq: +.Lpoint_double_shortcutx: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -4366,34 +6419,34 @@ ecp_nistz256_point_double: .byte 102,73,15,110,211 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2q + call __ecp_nistz256_mul_by_2x - movq 64+0(%rsi),%rax + movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 - leaq 64-0(%rsi),%rsi + leaq 64-128(%rsi),%rsi leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 0+0(%rsp),%rax + movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi + leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 32(%rbx),%rax + movq 32(%rbx),%rdx movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 - leaq 64-0(%rbx),%rsi + leaq 64-128(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 - call __ecp_nistz256_mul_montq - call __ecp_nistz256_mul_by_2q + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 @@ -4401,7 +6454,7 @@ ecp_nistz256_point_double: movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq + call __ecp_nistz256_add_tox movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 @@ -4409,15 +6462,15 @@ ecp_nistz256_point_double: movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx - movq 0+0(%rsp),%rax + movq 0+0(%rsp),%rdx movq 
8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi + leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 @@ -4456,59 +6509,59 @@ ecp_nistz256_point_double: orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) - movq 64(%rsp),%rax + movq 64(%rsp),%rdx leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q + call __ecp_nistz256_mul_by_2x leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq + call __ecp_nistz256_add_tox - movq 96(%rsp),%rax + movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi + leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q + call __ecp_nistz256_mul_by_2x - movq 0+32(%rsp),%rax + movq 0+32(%rsp),%rdx movq 8+32(%rsp),%r14 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi - call __ecp_nistz256_subq + call __ecp_nistz256_subx - movq 32(%rsp),%rax + movq 32(%rsp),%rdx leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx @@ -4517,16 +6570,16 @@ ecp_nistz256_point_double: movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) - leaq 0-0(%rsp),%rsi + leaq 0-128(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 
0(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx .byte 102,72,15,126,203 .byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx leaq 160+56(%rsp),%rsi .cfi_def_cfa %rsi,8 @@ -4544,15 +6597,15 @@ ecp_nistz256_point_double: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Lpoint_doubleq_epilogue: +.Lpoint_doublex_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.type ecp_nistz256_point_add,@function +.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex +.type ecp_nistz256_point_addx,@function .align 32 -ecp_nistz256_point_add: +ecp_nistz256_point_addx: .cfi_startproc +.Lpoint_addx: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4573,7 +6626,7 @@ ecp_nistz256_point_add: .cfi_offset %r15,-56 subq $576+8,%rsp .cfi_adjust_cfa_offset 32*18+8 -.Lpoint_addq_body: +.Lpoint_addx_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -4597,7 +6650,7 @@ ecp_nistz256_point_add: movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rax + movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 @@ -4613,13 +6666,13 @@ ecp_nistz256_point_add: por %xmm0,%xmm1 .byte 102,72,15,110,199 - leaq 64-0(%rsi),%rsi - movq %rax,544+0(%rsp) + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 @@ -4630,59 +6683,59 @@ ecp_nistz256_point_add: pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rax + movq 64+0(%rbx),%rdx movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 - leaq 64-0(%rbx),%rsi + leaq 64-128(%rbx),%rsi leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 544(%rsp),%rax 
+ movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi + leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 448(%rsp),%rax + movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 416(%rsp),%rax + movq 416(%rsp),%rdx leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi + leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 512(%rsp),%rax + movq 512(%rsp),%rdx leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 - leaq 0+256(%rsp),%rsi + leaq -128+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx orq %r13,%r12 movdqa %xmm4,%xmm2 @@ -4691,29 +6744,29 @@ ecp_nistz256_point_add: por %xmm5,%xmm2 .byte 102,73,15,110,220 - movq 384(%rsp),%rax + movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi + leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 480(%rsp),%rax + movq 480(%rsp),%rdx leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call 
__ecp_nistz256_sub_fromx orq %r13,%r12 orq %r8,%r12 @@ -4727,73 +6780,73 @@ ecp_nistz256_point_add: .byte 0x3e - jnz .Ladd_proceedq + jnz .Ladd_proceedx -.Ladd_doubleq: +.Ladd_doublex: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp .cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutq + jmp .Lpoint_double_shortcutx .cfi_adjust_cfa_offset 416 .align 32 -.Ladd_proceedq: - movq 0+64(%rsp),%rax +.Ladd_proceedx: + movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 448(%rsp),%rax + movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi + leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 0+0(%rsp),%rax + movq 0+0(%rsp),%rdx movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi + leaq -128+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 544(%rsp),%rax + movq 544(%rsp),%rdx leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 - leaq 0+352(%rsp),%rsi + leaq -128+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 0(%rsp),%rax + movq 0(%rsp),%rdx leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 160(%rsp),%rax + movq 160(%rsp),%rdx leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq + call 
__ecp_nistz256_mul_montx @@ -4825,11 +6878,11 @@ ecp_nistz256_point_add: cmovcq %r10,%r9 movq 24(%rsi),%r10 - call __ecp_nistz256_subq + call __ecp_nistz256_subx leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp @@ -4837,35 +6890,35 @@ ecp_nistz256_point_add: movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi - call __ecp_nistz256_subq + call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) - movq 128(%rsp),%rax + movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi + leaq -128+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 320(%rsp),%rax + movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 @@ -4941,7 +6994,7 @@ ecp_nistz256_point_add: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) -.Ladd_doneq: +.Ladd_donex: leaq 576+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 @@ -4958,15 +7011,15 @@ ecp_nistz256_point_add: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Lpoint_addq_epilogue: +.Lpoint_addx_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,@function +.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx +.type ecp_nistz256_point_add_affinex,@function .align 32 -ecp_nistz256_point_add_affine: +ecp_nistz256_point_add_affinex: .cfi_startproc +.Lpoint_add_affinex: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset 
%rbp,-16 @@ -4987,7 +7040,7 @@ ecp_nistz256_point_add_affine: .cfi_offset %r15,-56 subq $480+8,%rsp .cfi_adjust_cfa_offset 32*15+8 -.Ladd_affineq_body: +.Ladd_affinex_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -4996,7 +7049,7 @@ ecp_nistz256_point_add_affine: movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rax + movq 64+0(%rsi),%rdx movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 @@ -5026,13 +7079,13 @@ ecp_nistz256_point_add_affine: pxor %xmm4,%xmm4 por %xmm1,%xmm3 - leaq 64-0(%rsi),%rsi + leaq 64-128(%rsi),%rsi leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rax + movq 0(%rbx),%rdx movq %r12,%r9 por %xmm3,%xmm4 @@ -5045,84 +7098,84 @@ ecp_nistz256_point_add_affine: pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 - leaq 32-0(%rsp),%rsi + leaq 32-128(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx - movq 384(%rsp),%rax + movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 384(%rsp),%rax + movq 384(%rsp),%rdx leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 448(%rsp),%rax + movq 448(%rsp),%rdx leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi + leaq -128+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi 
- call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx - movq 0+64(%rsp),%rax + movq 0+64(%rsp),%rdx movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 0+96(%rsp),%rax + movq 0+96(%rsp),%rdx movq 8+96(%rsp),%r14 - leaq 0+96(%rsp),%rsi + leaq -128+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montq + call __ecp_nistz256_sqr_montx - movq 128(%rsp),%rax + movq 128(%rsp),%rdx leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 320(%rsp),%rax + movq 320(%rsp),%rdx leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 - leaq 0+128(%rsp),%rsi + leaq -128+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx @@ -5154,11 +7207,11 @@ ecp_nistz256_point_add_affine: cmovcq %r10,%r9 movq 24(%rsi),%r10 - call __ecp_nistz256_subq + call __ecp_nistz256_subx leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp @@ -5166,35 +7219,35 @@ ecp_nistz256_point_add_affine: movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi - call __ecp_nistz256_subq + call __ecp_nistz256_subx movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) - movq 352(%rsp),%rax + movq 352(%rsp),%rdx leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 - leaq 0+160(%rsp),%rsi + leaq -128+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx - movq 96(%rsp),%rax + movq 96(%rsp),%rdx leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 
8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi + leaq -128+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_montx leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromq + call __ecp_nistz256_sub_fromx .byte 102,72,15,126,199 @@ -5286,7 +7339,7 @@ ecp_nistz256_point_add_affine: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Ladd_affineq_epilogue: +.Ladd_affinex_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S index 55ad7db1f24..078353528d5 100644 --- a/secure/lib/libcrypto/amd64/ghash-x86_64.S +++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S @@ -1304,7 +1304,108 @@ gcm_ghash_clmul: .align 32 gcm_init_avx: .cfi_startproc - jmp .L_init_clmul + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + 
vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 .cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx @@ -1320,7 +1421,377 @@ gcm_gmult_avx: .align 32 gcm_ghash_avx: .cfi_startproc - jmp .L_ghash_clmul + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + 
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb 
%xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + 
vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor 
%xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + 
vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 diff --git a/secure/lib/libcrypto/amd64/poly1305-x86_64.S b/secure/lib/libcrypto/amd64/poly1305-x86_64.S index d74ee9b4505..c5a1f45fc5d 100644 --- a/secure/lib/libcrypto/amd64/poly1305-x86_64.S +++ b/secure/lib/libcrypto/amd64/poly1305-x86_64.S @@ -25,6 +25,15 @@ poly1305_init: leaq poly1305_blocks(%rip),%r10 leaq poly1305_emit(%rip),%r11 + movq OPENSSL_ia32cap_P+4(%rip),%r9 + leaq poly1305_blocks_avx(%rip),%rax + leaq poly1305_emit_avx(%rip),%rcx + btq $28,%r9 + cmovcq %rax,%r10 + cmovcq %rcx,%r11 + leaq poly1305_blocks_avx2(%rip),%rax + btq $37,%r9 + cmovcq %rax,%r10 movq $0x0ffffffc0fffffff,%rax movq $0x0ffffffc0ffffffc,%rcx andq 0(%rsi),%rax @@ -180,6 +189,1782 @@ 
poly1305_emit: .byte 0xf3,0xc3 .cfi_endproc .size poly1305_emit,.-poly1305_emit +.type __poly1305_block,@function +.align 32 +__poly1305_block: +.cfi_startproc + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%r10 + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%r10 + + mulq %rbx + movq %rbp,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%r10 + + imulq %r11,%rbp + addq %r9,%rbx + movq $-4,%rax + adcq %rbp,%r10 + + andq %r10,%rax + movq %r10,%rbp + shrq $2,%r10 + andq $3,%rbp + addq %r10,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%rbp + .byte 0xf3,0xc3 +.cfi_endproc +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,@function +.align 32 +__poly1305_init_avx: +.cfi_startproc + movq %r11,%r14 + movq %r12,%rbx + xorq %rbp,%rbp + + leaq 48+64(%rdi),%rdi + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + movq %r14,%r8 + andl %r14d,%eax + movq %r11,%r9 + andl %r11d,%edx + movl %eax,-64(%rdi) + shrq $26,%r8 + movl %edx,-60(%rdi) + shrq $26,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + andl %r8d,%eax + andl %r9d,%edx + movl %eax,-48(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-44(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,-32(%rdi) + shrq $26,%r8 + movl %edx,-28(%rdi) + shrq $26,%r9 + + movq %rbx,%rax + movq %r12,%rdx + shlq $12,%rax + shlq $12,%rdx + orq %r8,%rax + orq %r9,%rdx + andl $0x3ffffff,%eax + andl $0x3ffffff,%edx + movl %eax,-16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-12(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,0(%rdi) + movq %rbx,%r8 + movl %edx,4(%rdi) + movq %r12,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + shrq $14,%r8 + shrq $14,%r9 + andl %r8d,%eax + andl %r9d,%edx + movl %eax,16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,20(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,32(%rdi) + shrq $26,%r8 + movl %edx,36(%rdi) + shrq $26,%r9 + + 
movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,48(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r9d,52(%rdi) + leaq (%r9,%r9,4),%r9 + movl %r8d,64(%rdi) + movl %r9d,68(%rdi) + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-52(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-36(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-20(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-4(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,12(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,28(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,44(%rdi) + + movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,60(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,76(%rdi) + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-56(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-40(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-24(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-8(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,8(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,24(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,40(%rdi) + + movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,56(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,72(%rdi) + + leaq -48-64(%rdi),%rdi + .byte 0xf3,0xc3 +.cfi_endproc +.size __poly1305_init_avx,.-__poly1305_init_avx + +.type poly1305_blocks_avx,@function +.align 32 +poly1305_blocks_avx: +.cfi_startproc + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + andq $-16,%rdx + jz .Lno_data_avx + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx + + testq $31,%rdx + jz .Leven_avx + + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lblocks_avx_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%ebp + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %rbp,%r8 + shlq $40,%r8 + shrq $24,%rbp + addq %r8,%rbx + adcq $0,%rbp + + movq $-4,%r9 + movq %rbp,%r8 + andq %rbp,%r9 + shrq $2,%r8 + andq $3,%rbp + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%rbp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + + call __poly1305_block + + testq %rcx,%rcx + jz .Lstore_base2_64_avx + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%rbp + + subq $16,%r15 + jz .Lstore_base2_26_avx + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %rbp,16(%rdi) + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %ebp,16(%rdi) +.align 16 +.Ldone_avx: + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 
+.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx: +.Lblocks_avx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lbase2_64_avx_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%ebp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $31,%rdx + jz .Linit_avx + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + +.Linit_avx: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%rbp + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + movl $1,20(%rdi) + + call __poly1305_init_avx + +.Lproceed_avx: + movq %r15,%rdx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 
16(%rdi),%xmm4 + +.Ldo_avx: + leaq -88(%rsp),%r11 +.cfi_def_cfa %r11,0x60 + subq $0x178,%rsp + subq $64,%rdx + leaq -32(%rsi),%rax + cmovcq %rax,%rsi + + vmovdqu 48(%rdi),%xmm14 + leaq 112(%rdi),%rdi + leaq .Lconst(%rip),%rcx + + + + vmovdqu 32(%rsi),%xmm5 + vmovdqu 48(%rsi),%xmm6 + vmovdqa 64(%rcx),%xmm15 + + vpsrldq $6,%xmm5,%xmm7 + vpsrldq $6,%xmm6,%xmm8 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + vpsrlq $40,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + jbe .Lskip_loop_avx + + + vmovdqu -48(%rdi),%xmm11 + vmovdqu -32(%rdi),%xmm12 + vpshufd $0xEE,%xmm14,%xmm13 + vpshufd $0x44,%xmm14,%xmm10 + vmovdqa %xmm13,-144(%r11) + vmovdqa %xmm10,0(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vmovdqu -16(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-128(%r11) + vmovdqa %xmm11,16(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqu 0(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-112(%r11) + vmovdqa %xmm12,32(%rsp) + vpshufd $0xEE,%xmm10,%xmm14 + vmovdqu 16(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm14,-96(%r11) + vmovdqa %xmm10,48(%rsp) + vpshufd $0xEE,%xmm11,%xmm13 + vmovdqu 32(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm13,-80(%r11) + vmovdqa %xmm11,64(%rsp) + vpshufd $0xEE,%xmm12,%xmm14 + vmovdqu 48(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm14,-64(%r11) + vmovdqa %xmm12,80(%rsp) + vpshufd $0xEE,%xmm10,%xmm13 + vmovdqu 64(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm13,-48(%r11) + vmovdqa %xmm10,96(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-32(%r11) + vmovdqa %xmm11,112(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqa 0(%rsp),%xmm14 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-16(%r11) + vmovdqa %xmm12,128(%rsp) + + jmp .Loop_avx + 
+.align 32 +.Loop_avx: + + + + + + + + + + + + + + + + + + + + + vpmuludq %xmm5,%xmm14,%xmm10 + vpmuludq %xmm6,%xmm14,%xmm11 + vmovdqa %xmm2,32(%r11) + vpmuludq %xmm7,%xmm14,%xmm12 + vmovdqa 16(%rsp),%xmm2 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vmovdqa %xmm0,0(%r11) + vpmuludq 32(%rsp),%xmm9,%xmm0 + vmovdqa %xmm1,16(%r11) + vpmuludq %xmm8,%xmm2,%xmm1 + vpaddq %xmm0,%xmm10,%xmm10 + vpaddq %xmm1,%xmm14,%xmm14 + vmovdqa %xmm3,48(%r11) + vpmuludq %xmm7,%xmm2,%xmm0 + vpmuludq %xmm6,%xmm2,%xmm1 + vpaddq %xmm0,%xmm13,%xmm13 + vmovdqa 48(%rsp),%xmm3 + vpaddq %xmm1,%xmm12,%xmm12 + vmovdqa %xmm4,64(%r11) + vpmuludq %xmm5,%xmm2,%xmm2 + vpmuludq %xmm7,%xmm3,%xmm0 + vpaddq %xmm2,%xmm11,%xmm11 + + vmovdqa 64(%rsp),%xmm4 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm3,%xmm1 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm1,%xmm13,%xmm13 + vmovdqa 80(%rsp),%xmm2 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm4,%xmm0 + vpmuludq %xmm8,%xmm4,%xmm4 + vpaddq %xmm0,%xmm11,%xmm11 + vmovdqa 96(%rsp),%xmm3 + vpaddq %xmm4,%xmm10,%xmm10 + + vmovdqa 128(%rsp),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm1 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm1,%xmm14,%xmm14 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq %xmm9,%xmm3,%xmm0 + vpmuludq %xmm8,%xmm3,%xmm1 + vpaddq %xmm0,%xmm12,%xmm12 + vmovdqu 0(%rsi),%xmm0 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm7,%xmm4,%xmm7 + vpaddq %xmm3,%xmm10,%xmm10 + + vmovdqu 16(%rsi),%xmm1 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm8,%xmm4,%xmm8 + vpmuludq %xmm9,%xmm4,%xmm9 + vpsrldq $6,%xmm0,%xmm2 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm9,%xmm13,%xmm13 + vpsrldq $6,%xmm1,%xmm3 + vpmuludq 112(%rsp),%xmm5,%xmm9 + vpmuludq %xmm6,%xmm4,%xmm5 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpaddq %xmm9,%xmm14,%xmm14 + vmovdqa -144(%r11),%xmm9 + vpaddq %xmm5,%xmm10,%xmm10 + + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + + vpsrldq $5,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + 
vpand %xmm15,%xmm1,%xmm1 + vpand 0(%rcx),%xmm4,%xmm4 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + leaq 32(%rsi),%rax + leaq 64(%rsi),%rsi + subq $64,%rdx + cmovcq %rax,%rsi + + + + + + + + + + + vpmuludq %xmm0,%xmm9,%xmm5 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vmovdqa -128(%r11),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm5 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm5,%xmm12,%xmm12 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpmuludq -112(%r11),%xmm4,%xmm5 + vpaddq %xmm9,%xmm14,%xmm14 + + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm2,%xmm7,%xmm6 + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -96(%r11),%xmm8 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm7,%xmm6 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm6,%xmm12,%xmm12 + vpaddq %xmm7,%xmm11,%xmm11 + + vmovdqa -80(%r11),%xmm9 + vpmuludq %xmm2,%xmm8,%xmm5 + vpmuludq %xmm1,%xmm8,%xmm6 + vpaddq %xmm5,%xmm14,%xmm14 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -64(%r11),%xmm7 + vpmuludq %xmm0,%xmm8,%xmm8 + vpmuludq %xmm4,%xmm9,%xmm5 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm5,%xmm11,%xmm11 + vmovdqa -48(%r11),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm9 + vpmuludq %xmm1,%xmm7,%xmm6 + vpaddq %xmm9,%xmm10,%xmm10 + + vmovdqa -16(%r11),%xmm9 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm8,%xmm5 + vpaddq %xmm7,%xmm13,%xmm13 + vpaddq %xmm5,%xmm12,%xmm12 + vmovdqu 32(%rsi),%xmm5 + vpmuludq %xmm3,%xmm8,%xmm7 + vpmuludq %xmm2,%xmm8,%xmm8 + vpaddq %xmm7,%xmm11,%xmm11 + vmovdqu 48(%rsi),%xmm6 + vpaddq %xmm8,%xmm10,%xmm10 + + vpmuludq %xmm2,%xmm9,%xmm2 + vpmuludq %xmm3,%xmm9,%xmm3 + vpsrldq $6,%xmm5,%xmm7 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm9,%xmm4 + vpsrldq $6,%xmm6,%xmm8 + vpaddq %xmm3,%xmm12,%xmm2 + vpaddq 
%xmm4,%xmm13,%xmm3 + vpmuludq -32(%r11),%xmm0,%xmm4 + vpmuludq %xmm1,%xmm9,%xmm0 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpaddq %xmm4,%xmm14,%xmm4 + vpaddq %xmm0,%xmm10,%xmm0 + + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + + vpsrldq $5,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vmovdqa 0(%rsp),%xmm14 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpand 0(%rcx),%xmm9,%xmm9 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + + + + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm11,%xmm1 + + vpsrlq $26,%xmm4,%xmm10 + vpand %xmm15,%xmm4,%xmm4 + + vpsrlq $26,%xmm1,%xmm11 + vpand %xmm15,%xmm1,%xmm1 + vpaddq %xmm11,%xmm2,%xmm2 + + vpaddq %xmm10,%xmm0,%xmm0 + vpsllq $2,%xmm10,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + + vpsrlq $26,%xmm2,%xmm12 + vpand %xmm15,%xmm2,%xmm2 + vpaddq %xmm12,%xmm3,%xmm3 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm1,%xmm1 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + ja .Loop_avx + +.Lskip_loop_avx: + + + + vpshufd $0x10,%xmm14,%xmm14 + addq $32,%rdx + jnz .Long_tail_avx + + vpaddq %xmm2,%xmm7,%xmm7 + vpaddq %xmm0,%xmm5,%xmm5 + vpaddq %xmm1,%xmm6,%xmm6 + vpaddq %xmm3,%xmm8,%xmm8 + vpaddq %xmm4,%xmm9,%xmm9 + +.Long_tail_avx: + vmovdqa %xmm2,32(%r11) + vmovdqa %xmm0,0(%r11) + vmovdqa %xmm1,16(%r11) + vmovdqa %xmm3,48(%r11) + vmovdqa %xmm4,64(%r11) + + + + + + + + vpmuludq %xmm7,%xmm14,%xmm12 + vpmuludq %xmm5,%xmm14,%xmm10 + vpshufd $0x10,-48(%rdi),%xmm2 + vpmuludq %xmm6,%xmm14,%xmm11 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm8,%xmm2,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpshufd $0x10,-32(%rdi),%xmm3 + vpmuludq %xmm7,%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpshufd $0x10,-16(%rdi),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm0 + vpaddq 
%xmm0,%xmm12,%xmm12 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm9,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + vpshufd $0x10,0(%rdi),%xmm2 + vpmuludq %xmm7,%xmm4,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm4,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpshufd $0x10,16(%rdi),%xmm3 + vpmuludq %xmm5,%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpshufd $0x10,32(%rdi),%xmm4 + vpmuludq %xmm8,%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + + vpmuludq %xmm6,%xmm3,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpshufd $0x10,48(%rdi),%xmm2 + vpmuludq %xmm9,%xmm4,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpshufd $0x10,64(%rdi),%xmm3 + vpmuludq %xmm8,%xmm4,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq %xmm9,%xmm3,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq %xmm8,%xmm3,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm7,%xmm3,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm6,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + jz .Lshort_tail_avx + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + + vpsrldq $6,%xmm0,%xmm2 + vpsrldq $6,%xmm1,%xmm3 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + vpsrlq $40,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpshufd $0x32,-64(%rdi),%xmm9 + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + + + + vpmuludq %xmm0,%xmm9,%xmm5 + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + 
vpshufd $0x32,-48(%rdi),%xmm7 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpaddq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpshufd $0x32,-32(%rdi),%xmm8 + vpmuludq %xmm2,%xmm7,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpshufd $0x32,-16(%rdi),%xmm9 + vpmuludq %xmm1,%xmm7,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + + vpshufd $0x32,0(%rdi),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm6 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm9,%xmm5 + vpaddq %xmm5,%xmm13,%xmm13 + vpshufd $0x32,16(%rdi),%xmm8 + vpmuludq %xmm0,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm4,%xmm7,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpshufd $0x32,32(%rdi),%xmm9 + vpmuludq %xmm3,%xmm7,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + + vpmuludq %xmm1,%xmm8,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm8,%xmm8 + vpaddq %xmm8,%xmm13,%xmm13 + vpshufd $0x32,48(%rdi),%xmm7 + vpmuludq %xmm4,%xmm9,%xmm6 + vpaddq %xmm6,%xmm12,%xmm12 + vpshufd $0x32,64(%rdi),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm5 + vpaddq %xmm5,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm9 + vpaddq %xmm9,%xmm10,%xmm10 + + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm14,%xmm14 + vpmuludq %xmm4,%xmm8,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm3,%xmm8,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm2,%xmm8,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm1,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + +.Lshort_tail_avx: + + + + vpsrldq $8,%xmm14,%xmm9 + vpsrldq $8,%xmm13,%xmm8 + vpsrldq $8,%xmm11,%xmm6 + vpsrldq $8,%xmm10,%xmm5 + vpsrldq $8,%xmm12,%xmm7 + vpaddq %xmm8,%xmm13,%xmm13 + vpaddq %xmm9,%xmm14,%xmm14 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm7,%xmm12,%xmm12 + + + + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq 
%xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm14,%xmm4 + vpand %xmm15,%xmm14,%xmm14 + + vpsrlq $26,%xmm11,%xmm1 + vpand %xmm15,%xmm11,%xmm11 + vpaddq %xmm1,%xmm12,%xmm12 + + vpaddq %xmm4,%xmm10,%xmm10 + vpsllq $2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpsrlq $26,%xmm12,%xmm2 + vpand %xmm15,%xmm12,%xmm12 + vpaddq %xmm2,%xmm13,%xmm13 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vmovd %xmm10,-112(%rdi) + vmovd %xmm11,-108(%rdi) + vmovd %xmm12,-104(%rdi) + vmovd %xmm13,-100(%rdi) + vmovd %xmm14,-96(%rdi) + leaq 88(%r11),%rsp +.cfi_def_cfa %rsp,8 + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_blocks_avx,.-poly1305_blocks_avx + +.type poly1305_emit_avx,@function +.align 32 +poly1305_emit_avx: +.cfi_startproc + cmpl $0,20(%rdi) + je .Lemit + + movl 0(%rdi),%eax + movl 4(%rdi),%ecx + movl 8(%rdi),%r8d + movl 12(%rdi),%r11d + movl 16(%rdi),%r10d + + shlq $26,%rcx + movq %r8,%r9 + shlq $52,%r8 + addq %rcx,%rax + shrq $12,%r9 + addq %rax,%r8 + adcq $0,%r9 + + shlq $14,%r11 + movq %r10,%rax + shrq $24,%r10 + addq %r11,%r9 + shlq $40,%rax + addq %rax,%r9 + adcq $0,%r10 + + movq %r10,%rax + movq %r10,%rcx + andq $3,%r10 + shrq $2,%rax + andq $-4,%rcx + addq %rcx,%rax + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_emit_avx,.-poly1305_emit_avx +.type poly1305_blocks_avx2,@function +.align 32 +poly1305_blocks_avx2: +.cfi_startproc + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + andq $-16,%rdx + jz .Lno_data_avx2 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2 + + testq $63,%rdx + jz .Leven_avx2 + + pushq %rbx 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lblocks_avx2_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%ebp + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %rbp,%r8 + shlq $40,%r8 + shrq $24,%rbp + addq %r8,%rbx + adcq $0,%rbp + + movq $-4,%r9 + movq %rbp,%r8 + andq %rbp,%r9 + shrq $2,%r8 + andq $3,%rbp + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%rbp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%rbp + + testq %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %rbp,16(%rdi) + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %ebp,16(%rdi) +.align 16 +.Ldone_avx2: + movq 
0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc + +.align 32 +.Lbase2_64_avx2: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lbase2_64_avx2_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%ebp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%rbp + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + movl $1,20(%rdi) + + call __poly1305_init_avx + +.Lproceed_avx2: + movq %r15,%rdx + movl OPENSSL_ia32cap_P+8(%rip),%r10d + movl $3221291008,%r11d + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx 
+.cfi_restore %rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 +.cfi_endproc + +.align 32 +.Leven_avx2: +.cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%r10d + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2: + leaq -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + + + + + + + + + vpaddq 
%ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + + + + + + + + + + + + + + + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + + + + vpsrlq $26,%ymm3,%ymm14 + vpand 
%ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2 + +.byte 0x66,0x90 +.Ltail_avx2: + + + + + + + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq 
%ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + + + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + + + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq 8(%r11),%rsp +.cfi_def_cfa %rsp,8 + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size 
poly1305_blocks_avx2,.-poly1305_blocks_avx2 +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long 16777216,0,16777216,0,16777216,0,16777216,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 .globl xor128_encrypt_n_pad diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S index e957915a7d8..3075a52d2ee 100644 --- a/secure/lib/libcrypto/amd64/rsaz-avx2.S +++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S @@ -2,26 +2,1745 @@ /* Do not modify. This file is auto-generated from rsaz-avx2.pl. 
*/ .text -.globl rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -rsaz_avx2_eligible: - xorl %eax,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - .globl rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.globl rsaz_1024_norm2red_avx2 -.globl rsaz_1024_red2norm_avx2 -.globl rsaz_1024_scatter5_avx2 -.globl rsaz_1024_gather5_avx2 .type rsaz_1024_sqr_avx2,@function +.align 64 rsaz_1024_sqr_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + movq %rax,%rbp +.cfi_def_cfa_register %rbp + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz .Lsqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +.Lsqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 192(%rsp),%rbx + vmovdqu .Land_mask(%rip),%ymm15 + jmp .LOOP_GRANDE_SQR_1024 + +.align 32 
+.LOOP_GRANDE_SQR_1024: + leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) + vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp .Lsqr_entry_1024 +.align 32 +.LOOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 
+.Lsqr_entry_1024: + vmovdqu %ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 
192-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz .LOOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 
64-192(%rbx),%ymm2 + vmovdqu 96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp .LOOP_REDUCE_1024 + +.align 32 +.LOOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + 
addq %r11,%rax + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + 
movq 8(%rsp),%r10 + vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq %ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz .LOOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 
+ vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd 
$3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu %ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne .LOOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lsqr_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.type rsaz_1024_mul_avx2,@function +.align 64 rsaz_1024_mul_avx2: -rsaz_1024_norm2red_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + movq %rax,%rbp +.cfi_def_cfa_register %rbp + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz .Lmul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 
224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +.Lmul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu .Land_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp .Loop_mul_1024 + +.align 32 +.Loop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + 
imulq 8-128(%rcx),%rax + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm12 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 + imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax 
+ addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 
+ vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + 
vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz .Loop_mul_1024 + vpaddq (%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq 
$0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd 
$3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lmul_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 +.globl rsaz_1024_red2norm_avx2 +.type rsaz_1024_red2norm_avx2,@function +.align 32 rsaz_1024_red2norm_avx2: +.cfi_startproc + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq $6,%r11 + addq %r8,%rax + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq %r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq %r10,%r11 + shlq $57,%r10 + shrq 
$7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 152(%rsi),%r10 + shlq $26,%r9 + movq %r10,%r11 + shlq $55,%r10 + shrq $9,%r11 
+ addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 + +.globl rsaz_1024_norm2red_avx2 +.type rsaz_1024_norm2red_avx2,@function +.align 32 +rsaz_1024_norm2red_avx2: +.cfi_startproc + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq %r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 + movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq %rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) + movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + 
shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq %r8,184(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 +.globl rsaz_1024_scatter5_avx2 +.type rsaz_1024_scatter5_avx2,@function +.align 32 rsaz_1024_scatter5_avx2: +.cfi_startproc + vzeroupper + vmovdqu .Lscatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp .Loop_scatter_1024 + +.align 32 +.Loop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz .Loop_scatter_1024 + + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 + +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_gather5_avx2,@function +.align 32 rsaz_1024_gather5_avx2: -.byte 0x0f,0x0b +.cfi_startproc + vzeroupper + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq .Linc(%rip),%r10 + 
leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + leaq 128(%rsi),%rsi + movl $9,%edx + +.Loop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 
+ vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz .Loop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: +.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 + +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,@function +.align 32 +rsaz_avx2_eligible: + movl OPENSSL_ia32cap_P+8(%rip),%eax + movl $524544,%ecx + movl $0,%edx + andl %eax,%ecx + cmpl $524544,%ecx + cmovel %edx,%eax + andl $32,%eax + shrl $5,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.align 64 +.Land_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff +.Lscatter_permd: +.long 0,2,4,6,7,7,7,7 +.Lgather_permd: +.long 0,7,1,7,2,7,3,7 +.Linc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.align 64 diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S index ae64f7a7398..3ba29ea52dd 100644 --- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -35,6 +35,10 @@ rsaz_512_sqr: movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -405,6 +409,282 @@ rsaz_512_sqr: decl 
%r8d jnz .Loop_sqr + jmp .Lsqr_tail + +.align 32 +.Loop_sqrx: + movl %r8d,128+8(%rsp) +.byte 102,72,15,110,199 + + mulxq %rax,%r8,%r9 + movq %rax,%rbx + + mulxq 16(%rsi),%rcx,%r10 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r11 + adcxq %rcx,%r9 + +.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 + adcxq %rax,%r10 + +.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 + adcxq %rcx,%r11 + + mulxq 48(%rsi),%rcx,%r14 + adcxq %rax,%r12 + adcxq %rcx,%r13 + + mulxq 56(%rsi),%rax,%r15 + adcxq %rax,%r14 + adcxq %rbp,%r15 + + mulxq %rdx,%rax,%rdi + movq %rbx,%rdx + xorq %rcx,%rcx + adoxq %r8,%r8 + adcxq %rdi,%r8 + adoxq %rbp,%rcx + adcxq %rbp,%rcx + + movq %rax,(%rsp) + movq %r8,8(%rsp) + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 + adoxq %rax,%r10 + adcxq %rbx,%r11 + + mulxq 24(%rsi),%rdi,%r8 + adoxq %rdi,%r11 +.byte 0x66 + adcxq %r8,%r12 + + mulxq 32(%rsi),%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + + mulxq 40(%rsi),%rdi,%r8 + adoxq %rdi,%r13 + adcxq %r8,%r14 + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r14 + adcxq %rbx,%r15 + +.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 + adoxq %rdi,%r15 + adcxq %rbp,%r8 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r8 +.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 + + xorq %rbx,%rbx + adoxq %r9,%r9 + + adcxq %rcx,%rax + adoxq %r10,%r10 + adcxq %rax,%r9 + adoxq %rbp,%rbx + adcxq %rdi,%r10 + adcxq %rbp,%rbx + + movq %r9,16(%rsp) +.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 + + + mulxq 24(%rsi),%rdi,%r9 + adoxq %rdi,%r12 + adcxq %r9,%r13 + + mulxq 32(%rsi),%rax,%rcx + adoxq %rax,%r13 + adcxq %rcx,%r14 + +.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 + adoxq %rdi,%r14 + adcxq %r9,%r15 + +.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 + adoxq %rax,%r15 + adcxq %rcx,%r8 + + mulxq 56(%rsi),%rdi,%r9 + adoxq %rdi,%r8 + adcxq %rbp,%r9 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r9 + movq 24(%rsi),%rdx + + xorq %rcx,%rcx + adoxq %r11,%r11 + + adcxq %rbx,%rax + adoxq %r12,%r12 + adcxq %rax,%r11 + adoxq %rbp,%rcx + 
adcxq %rdi,%r12 + adcxq %rbp,%rcx + + movq %r11,32(%rsp) + movq %r12,40(%rsp) + + + mulxq 32(%rsi),%rax,%rbx + adoxq %rax,%r14 + adcxq %rbx,%r15 + + mulxq 40(%rsi),%rdi,%r10 + adoxq %rdi,%r15 + adcxq %r10,%r8 + + mulxq 48(%rsi),%rax,%rbx + adoxq %rax,%r8 + adcxq %rbx,%r9 + + mulxq 56(%rsi),%rdi,%r10 + adoxq %rdi,%r9 + adcxq %rbp,%r10 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r10 + movq 32(%rsi),%rdx + + xorq %rbx,%rbx + adoxq %r13,%r13 + + adcxq %rcx,%rax + adoxq %r14,%r14 + adcxq %rax,%r13 + adoxq %rbp,%rbx + adcxq %rdi,%r14 + adcxq %rbp,%rbx + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + + + mulxq 40(%rsi),%rdi,%r11 + adoxq %rdi,%r8 + adcxq %r11,%r9 + + mulxq 48(%rsi),%rax,%rcx + adoxq %rax,%r9 + adcxq %rcx,%r10 + + mulxq 56(%rsi),%rdi,%r11 + adoxq %rdi,%r10 + adcxq %rbp,%r11 + mulxq %rdx,%rax,%rdi + movq 40(%rsi),%rdx + adoxq %rbp,%r11 + + xorq %rcx,%rcx + adoxq %r15,%r15 + + adcxq %rbx,%rax + adoxq %r8,%r8 + adcxq %rax,%r15 + adoxq %rbp,%rcx + adcxq %rdi,%r8 + adcxq %rbp,%rcx + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r10 + adcxq %rbx,%r11 + +.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 + adoxq %rdi,%r11 + adcxq %rbp,%r12 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r12 + movq 48(%rsi),%rdx + + xorq %rbx,%rbx + adoxq %r9,%r9 + + adcxq %rcx,%rax + adoxq %r10,%r10 + adcxq %rax,%r9 + adcxq %rdi,%r10 + adoxq %rbp,%rbx + adcxq %rbp,%rbx + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + +.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 + adoxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq %rdx,%rax,%rdi + xorq %rcx,%rcx + movq 56(%rsi),%rdx + adoxq %r11,%r11 + + adcxq %rbx,%rax + adoxq %r12,%r12 + adcxq %rax,%r11 + adoxq %rbp,%rcx + adcxq %rdi,%r12 + adcxq %rbp,%rcx + +.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 +.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 + + + mulxq %rdx,%rax,%rdx + xorq %rbx,%rbx + adoxq %r13,%r13 + + adcxq %rcx,%rax + adoxq %rbp,%rbx + adcxq %r13,%rax + adcxq %rdx,%rbx + +.byte 
102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + movq %rax,112(%rsp) + movq %rbx,120(%rsp) + + call __rsaz_512_reducex + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz .Loop_sqrx + +.Lsqr_tail: leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 @@ -456,6 +736,10 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -473,6 +757,29 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_tail + +.align 32 +.Lmulx: + movq %rdx,%rbp + movq (%rdx),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex +.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -586,6 +893,10 @@ rsaz_512_mul_gather4: por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) @@ -766,6 +1077,142 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_gather_tail + +.align 32 +.Lmulx_gather: +.byte 102,76,15,126,194 + + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) + + mulxq (%rsi),%rbx,%r8 + movq %rbx,(%rsp) + xorl %edi,%edi + + 
mulxq 8(%rsi),%rax,%r9 + + mulxq 16(%rsi),%rbx,%r10 + adcxq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcxq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcxq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + adcxq %rbx,%r13 + adcxq %rax,%r14 +.byte 0x67 + movq %r8,%rbx + adcxq %rdi,%r15 + + movq $-7,%rcx + jmp .Loop_mulx_gather + +.align 32 +.Loop_mulx_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,194 + +.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + +.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 +.byte 0x67 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq %rbx,64(%rsp,%rcx,8) + adcxq %rax,%r14 + adoxq %rdi,%r15 + movq %r8,%rbx + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx_gather + + movq %r8,64(%rsp) + movq %r9,64+8(%rsp) + movq %r10,64+16(%rsp) + movq %r11,64+24(%rsp) + movq %r12,64+32(%rsp) + movq %r13,64+40(%rsp) + movq %r14,64+48(%rsp) + movq %r15,64+56(%rsp) + + movq 128(%rsp),%rdx + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp + + movq 
(%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -833,6 +1280,10 @@ rsaz_512_mul_scatter4: movq %rcx,128(%rsp) movq %rdi,%rbp + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -849,6 +1300,29 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_scatter_tail + +.align 32 +.Lmulx_scatter: + movq (%rdi),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -918,6 +1392,7 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: + movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -938,7 +1413,16 @@ rsaz_512_mul_by_one: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) + andl $0x80100,%eax + cmpl $0x80100,%eax + je .Lby_one_callx call __rsaz_512_reduce + jmp .Lby_one_tail +.align 32 +.Lby_one_callx: + movq 128(%rsp),%rdx + call __rsaz_512_reducex +.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1053,6 +1537,64 @@ __rsaz_512_reduce: .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce +.type __rsaz_512_reducex,@function +.align 32 +__rsaz_512_reducex: +.cfi_startproc + + imulq %r8,%rdx + xorq %rsi,%rsi + movl $8,%ecx + jmp .Lreduction_loopx + +.align 32 +.Lreduction_loopx: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + 
mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 128+8(%rsp),%rbx,%rdx + movq %rax,%rdx + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + + decl %ecx + jne .Lreduction_loopx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: @@ -1256,6 +1798,128 @@ __rsaz_512_mul: .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul +.type __rsaz_512_mulx,@function +.align 32 +__rsaz_512_mulx: +.cfi_startproc + mulxq (%rsi),%rbx,%r8 + movq $-6,%rcx + + mulxq 8(%rsi),%rax,%r9 + movq %rbx,8(%rsp) + + mulxq 16(%rsi),%rbx,%r10 + adcq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + movq 8(%rbp),%rdx + adcq %rbx,%r13 + adcq %rax,%r14 + adcq $0,%r15 + + xorq %rdi,%rdi + jmp .Loop_mulx + +.align 32 +.Loop_mulx: + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rsi),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq 64(%rbp,%rcx,8),%rdx + movq %rbx,8+64-8(%rsp,%rcx,8) + adcxq %rax,%r14 + 
adoxq %rdi,%r15 + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx + + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + +.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 + adcxq %rax,%r8 + adoxq %r10,%r9 + +.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 + adcxq %rax,%r14 + adoxq %rdi,%r15 + adcxq %rdi,%r15 + + movq %rbx,8+64-8(%rsp) + movq %r8,8+64(%rsp) + movq %r9,8+64+8(%rsp) + movq %r10,8+64+16(%rsp) + movq %r11,8+64+24(%rsp) + movq %r12,8+64+32(%rsp) + movq %r13,8+64+40(%rsp) + movq %r14,8+64+48(%rsp) + movq %r15,8+64+56(%rsp) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S index 488e554c247..0090e020c57 100644 --- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S @@ -12,6 +12,8 @@ sha1_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut + testl $268435456,%ecx + jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -2937,6 +2939,4319 @@ _shaext_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext +.type sha1_multi_block_avx,@function +.align 32 +sha1_multi_block_avx: +.cfi_startproc +_avx_shortcut: + shrq $32,%rcx + cmpl $2,%edx + jb .Lavx + testl $32,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + subq 
$288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 +.Lbody_avx: + leaq K_XX_XX(%rip),%rbp + leaq 256(%rsp),%rbx + + vzeroupper +.Loop_grande_avx: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone_avx + + vmovdqu 0(%rdi),%xmm10 + leaq 128(%rsp),%rax + vmovdqu 32(%rdi),%xmm11 + vmovdqu 64(%rdi),%xmm12 + vmovdqu 96(%rdi),%xmm13 + vmovdqu 128(%rdi),%xmm14 + vmovdqu 96(%rbp),%xmm5 + jmp .Loop_avx + +.align 32 +.Loop_avx: + vmovdqa -32(%rbp),%xmm15 + vmovd (%r8),%xmm0 + leaq 64(%r8),%r8 + vmovd (%r9),%xmm2 + leaq 64(%r9),%r9 + vpinsrd $1,(%r10),%xmm0,%xmm0 + leaq 64(%r10),%r10 + vpinsrd $1,(%r11),%xmm2,%xmm2 + leaq 64(%r11),%r11 + vmovd -60(%r8),%xmm1 + vpunpckldq %xmm2,%xmm0,%xmm0 + vmovd -60(%r9),%xmm9 + vpshufb %xmm5,%xmm0,%xmm0 + vpinsrd $1,-60(%r10),%xmm1,%xmm1 + vpinsrd $1,-60(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,0-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -56(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -56(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-56(%r10),%xmm2,%xmm2 + vpinsrd $1,-56(%r11),%xmm9,%xmm9 + vpaddd 
%xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,16-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -52(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -52(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-52(%r10),%xmm3,%xmm3 + vpinsrd $1,-52(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,32-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -48(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -48(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-48(%r10),%xmm4,%xmm4 + vpinsrd $1,-48(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,48-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -44(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -44(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpinsrd $1,-44(%r10),%xmm0,%xmm0 + vpinsrd $1,-44(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,64-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -40(%r8),%xmm1 + + vpslld $30,%xmm12,%xmm7 + vpor 
%xmm9,%xmm8,%xmm8 + vmovd -40(%r9),%xmm9 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpinsrd $1,-40(%r10),%xmm1,%xmm1 + vpinsrd $1,-40(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,80-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -36(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -36(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-36(%r10),%xmm2,%xmm2 + vpinsrd $1,-36(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,96-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -32(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -32(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-32(%r10),%xmm3,%xmm3 + vpinsrd $1,-32(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,112-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -28(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -28(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-28(%r10),%xmm4,%xmm4 + vpinsrd $1,-28(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld 
$5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,128-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -24(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -24(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpinsrd $1,-24(%r10),%xmm0,%xmm0 + vpinsrd $1,-24(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,144-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -20(%r8),%xmm1 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -20(%r9),%xmm9 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpinsrd $1,-20(%r10),%xmm1,%xmm1 + vpinsrd $1,-20(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,160-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -16(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -16(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-16(%r10),%xmm2,%xmm2 + vpinsrd $1,-16(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,176-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -12(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd 
-12(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-12(%r10),%xmm3,%xmm3 + vpinsrd $1,-12(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,192-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -8(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -8(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-8(%r10),%xmm4,%xmm4 + vpinsrd $1,-8(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,208-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -4(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -4(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vmovdqa 0-128(%rax),%xmm1 + vpinsrd $1,-4(%r10),%xmm0,%xmm0 + vpinsrd $1,-4(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + prefetcht0 63(%r8) + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,224-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + prefetcht0 63(%r9) + vpxor %xmm7,%xmm6,%xmm6 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + prefetcht0 63(%r10) + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + prefetcht0 63(%r11) + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 16-128(%rax),%xmm2 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 32-128(%rax),%xmm3 + 
+ vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,240-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 128-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 48-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,0-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 144-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 64-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,16-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 160-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 80-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,32-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 176-128(%rax),%xmm4,%xmm4 + vpsrld 
$27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 96-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,48-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 192-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 0(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 112-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,64-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 208-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 128-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,80-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 224-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor 
%xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 144-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,96-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 240-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 160-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,112-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 0-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 176-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,128-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 16-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 192-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,144-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 32-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + 
vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 208-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,160-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 48-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 224-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,176-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 64-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 240-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,192-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 80-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 0-128(%rax),%xmm2 + + vpslld 
$5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,208-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 96-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 16-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,224-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 112-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 32-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,240-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 128-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 48-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,0-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 144-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld 
$31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 64-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,16-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 160-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 80-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,32-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 176-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 96-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,48-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 192-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 112-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,64-128(%rax) + vpaddd 
%xmm1,%xmm13,%xmm13 + vpxor 208-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 128-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,80-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 224-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 144-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,96-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 240-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 160-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,112-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 0-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor 
%xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 32(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 176-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 16-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,128-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 192-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 32-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,144-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 208-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 48-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,160-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 224-128(%rax),%xmm1 + + vpaddd 
%xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 64-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,176-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 240-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 80-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,192-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 0-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 96-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,208-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 16-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 112-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld 
$27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,224-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 32-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 128-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,240-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 48-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 144-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,0-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 64-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 160-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,16-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor 
%xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 80-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 176-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,32-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 96-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 192-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,48-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 112-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 208-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,64-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 
+ + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 128-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 224-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,80-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 144-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 240-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,96-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 160-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 0-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,112-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 
176-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 16-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,128-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 192-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 32-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,144-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 208-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 48-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,160-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 224-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 64-128(%rax),%xmm0,%xmm0 + + vpaddd 
%xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,176-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 64(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 240-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,192-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 80-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 0-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,208-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 96-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 16-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,224-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 112-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd 
%xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 32-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,240-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 128-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 48-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,0-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 144-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 64-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,16-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 160-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 80-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,32-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 
176-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 96-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,48-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 192-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 112-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,64-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 208-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 128-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,80-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 224-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor 
%xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 144-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,96-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 240-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 160-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,112-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 0-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 176-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 16-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 192-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 32-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd 
%xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 208-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 48-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 224-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 64-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 240-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 80-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 0-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 96-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor 
%xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 16-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 112-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + + vpsrld $27,%xmm11,%xmm9 + vpaddd %xmm4,%xmm10,%xmm10 + vpxor %xmm13,%xmm6,%xmm6 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm7,%xmm12,%xmm12 + movl $1,%ecx + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqu (%rbx),%xmm6 + vpxor %xmm8,%xmm8,%xmm8 + vmovdqa %xmm6,%xmm7 + vpcmpgtd %xmm8,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + + vpand %xmm7,%xmm10,%xmm10 + vpand %xmm7,%xmm11,%xmm11 + vpaddd 0(%rdi),%xmm10,%xmm10 + vpand %xmm7,%xmm12,%xmm12 + vpaddd 32(%rdi),%xmm11,%xmm11 + vpand %xmm7,%xmm13,%xmm13 + vpaddd 64(%rdi),%xmm12,%xmm12 + vpand %xmm7,%xmm14,%xmm14 + vpaddd 96(%rdi),%xmm13,%xmm13 + vpaddd 128(%rdi),%xmm14,%xmm14 + vmovdqu %xmm10,0(%rdi) + vmovdqu %xmm11,32(%rdi) + vmovdqu %xmm12,64(%rdi) + vmovdqu %xmm13,96(%rdi) + vmovdqu %xmm14,128(%rdi) + + vmovdqu %xmm6,(%rbx) + vmovdqu 96(%rbp),%xmm5 + decl %edx + jnz .Loop_avx + + movl 280(%rsp),%edx + leaq 
16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande_avx + +.Ldone_avx: + movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_multi_block_avx,.-sha1_multi_block_avx +.type sha1_multi_block_avx2,@function +.align 32 +sha1_multi_block_avx2: +.cfi_startproc +_avx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-256,%rsp + movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 +.Lbody_avx2: + leaq K_XX_XX(%rip),%rbp + shrl $1,%edx + + vzeroupper +.Loop_grande_avx2: + movl %edx,552(%rsp) + xorl %edx,%edx + leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,16(%rbx) + cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,20(%rbx) + cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,24(%rbx) + cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx + cmpl 
%edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,28(%rbx) + cmovleq %rbp,%r11 + vmovdqu 0(%rdi),%ymm0 + leaq 128(%rsp),%rax + vmovdqu 32(%rdi),%ymm1 + leaq 256+128(%rsp),%rbx + vmovdqu 64(%rdi),%ymm2 + vmovdqu 96(%rdi),%ymm3 + vmovdqu 128(%rdi),%ymm4 + vmovdqu 96(%rbp),%ymm9 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vmovdqa -32(%rbp),%ymm15 + vmovd (%r12),%xmm10 + leaq 64(%r12),%r12 + vmovd (%r8),%xmm12 + leaq 64(%r8),%r8 + vmovd (%r13),%xmm7 + leaq 64(%r13),%r13 + vmovd (%r9),%xmm6 + leaq 64(%r9),%r9 + vpinsrd $1,(%r14),%xmm10,%xmm10 + leaq 64(%r14),%r14 + vpinsrd $1,(%r10),%xmm12,%xmm12 + leaq 64(%r10),%r10 + vpinsrd $1,(%r15),%xmm7,%xmm7 + leaq 64(%r15),%r15 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,(%r11),%xmm6,%xmm6 + leaq 64(%r11),%r11 + vpunpckldq %ymm6,%ymm12,%ymm12 + vmovd -60(%r12),%xmm11 + vinserti128 $1,%xmm12,%ymm10,%ymm10 + vmovd -60(%r8),%xmm8 + vpshufb %ymm9,%ymm10,%ymm10 + vmovd -60(%r13),%xmm7 + vmovd -60(%r9),%xmm6 + vpinsrd $1,-60(%r14),%xmm11,%xmm11 + vpinsrd $1,-60(%r10),%xmm8,%xmm8 + vpinsrd $1,-60(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-60(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,0-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -56(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -56(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -56(%r13),%xmm7 + vmovd -56(%r9),%xmm6 + vpinsrd $1,-56(%r14),%xmm12,%xmm12 + vpinsrd $1,-56(%r10),%xmm8,%xmm8 + vpinsrd $1,-56(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-56(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand 
%ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,32-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -52(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -52(%r8),%xmm8 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -52(%r13),%xmm7 + vmovd -52(%r9),%xmm6 + vpinsrd $1,-52(%r14),%xmm13,%xmm13 + vpinsrd $1,-52(%r10),%xmm8,%xmm8 + vpinsrd $1,-52(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-52(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,64-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 $1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -48(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -48(%r8),%xmm8 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -48(%r13),%xmm7 + vmovd -48(%r9),%xmm6 + vpinsrd $1,-48(%r14),%xmm14,%xmm14 + vpinsrd $1,-48(%r10),%xmm8,%xmm8 + vpinsrd $1,-48(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-48(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,96-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -44(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -44(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovd -44(%r13),%xmm7 + vmovd -44(%r9),%xmm6 + vpinsrd $1,-44(%r14),%xmm10,%xmm10 + vpinsrd 
$1,-44(%r10),%xmm8,%xmm8 + vpinsrd $1,-44(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-44(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,128-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -40(%r12),%xmm11 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -40(%r8),%xmm8 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovd -40(%r13),%xmm7 + vmovd -40(%r9),%xmm6 + vpinsrd $1,-40(%r14),%xmm11,%xmm11 + vpinsrd $1,-40(%r10),%xmm8,%xmm8 + vpinsrd $1,-40(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-40(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,160-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -36(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -36(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -36(%r13),%xmm7 + vmovd -36(%r9),%xmm6 + vpinsrd $1,-36(%r14),%xmm12,%xmm12 + vpinsrd $1,-36(%r10),%xmm8,%xmm8 + vpinsrd $1,-36(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-36(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,192-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -32(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -32(%r8),%xmm8 + 
vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -32(%r13),%xmm7 + vmovd -32(%r9),%xmm6 + vpinsrd $1,-32(%r14),%xmm13,%xmm13 + vpinsrd $1,-32(%r10),%xmm8,%xmm8 + vpinsrd $1,-32(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-32(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,224-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 $1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -28(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -28(%r8),%xmm8 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -28(%r13),%xmm7 + vmovd -28(%r9),%xmm6 + vpinsrd $1,-28(%r14),%xmm14,%xmm14 + vpinsrd $1,-28(%r10),%xmm8,%xmm8 + vpinsrd $1,-28(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-28(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,256-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -24(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -24(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovd -24(%r13),%xmm7 + vmovd -24(%r9),%xmm6 + vpinsrd $1,-24(%r14),%xmm10,%xmm10 + vpinsrd $1,-24(%r10),%xmm8,%xmm8 + vpinsrd $1,-24(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-24(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa 
%ymm14,288-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -20(%r12),%xmm11 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -20(%r8),%xmm8 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovd -20(%r13),%xmm7 + vmovd -20(%r9),%xmm6 + vpinsrd $1,-20(%r14),%xmm11,%xmm11 + vpinsrd $1,-20(%r10),%xmm8,%xmm8 + vpinsrd $1,-20(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-20(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,320-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -16(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -16(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -16(%r13),%xmm7 + vmovd -16(%r9),%xmm6 + vpinsrd $1,-16(%r14),%xmm12,%xmm12 + vpinsrd $1,-16(%r10),%xmm8,%xmm8 + vpinsrd $1,-16(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-16(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,352-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -12(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -12(%r8),%xmm8 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -12(%r13),%xmm7 + vmovd -12(%r9),%xmm6 + vpinsrd $1,-12(%r14),%xmm13,%xmm13 + vpinsrd $1,-12(%r10),%xmm8,%xmm8 + vpinsrd 
$1,-12(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-12(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,384-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 $1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -8(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -8(%r8),%xmm8 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -8(%r13),%xmm7 + vmovd -8(%r9),%xmm6 + vpinsrd $1,-8(%r14),%xmm14,%xmm14 + vpinsrd $1,-8(%r10),%xmm8,%xmm8 + vpinsrd $1,-8(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-8(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,416-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -4(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -4(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovdqa 0-128(%rax),%ymm11 + vmovd -4(%r13),%xmm7 + vmovd -4(%r9),%xmm6 + vpinsrd $1,-4(%r14),%xmm10,%xmm10 + vpinsrd $1,-4(%r10),%xmm8,%xmm8 + vpinsrd $1,-4(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-4(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + prefetcht0 63(%r12) + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,448-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + prefetcht0 63(%r13) + vpxor %ymm6,%ymm5,%ymm5 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + prefetcht0 
63(%r14) + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + prefetcht0 63(%r15) + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 32-128(%rax),%ymm12 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 64-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + prefetcht0 63(%r8) + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,480-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 256-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + prefetcht0 63(%r9) + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + prefetcht0 63(%r10) + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + prefetcht0 63(%r11) + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 96-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,0-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 288-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 128-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,32-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 320-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + + vpaddd 
%ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 160-128(%rax),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,64-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 352-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 192-128(%rax),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,96-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 384-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 0(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 224-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,128-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 416-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 256-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa 
%ymm11,160-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 448-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 288-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,192-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 480-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 320-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,224-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 0-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 352-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,256-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 32-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld 
$2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 384-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,288-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 64-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 416-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,320-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 96-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 448-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,352-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 128-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 480-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,384-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 
160-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 0-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,416-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 192-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 32-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,448-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 224-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 64-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,480-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 256-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 
+ vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 96-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,0-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 288-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 128-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,32-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 320-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 160-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,64-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 352-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 192-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,96-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 384-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor 
%ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 224-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,128-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 416-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 256-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,160-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 448-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 288-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,192-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 480-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 
320-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,224-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 0-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 32(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 352-256-128(%rbx),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 32-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,256-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 384-256-128(%rbx),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 64-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,288-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 416-256-128(%rbx),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 96-128(%rax),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld 
$27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,320-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 448-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 128-128(%rax),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,352-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 480-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 160-128(%rax),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,384-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 0-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 192-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,416-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpor 
%ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 32-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 224-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,448-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 64-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 256-256-128(%rbx),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,480-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 96-128(%rax),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 288-256-128(%rbx),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,0-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd 
%ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 128-128(%rax),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 320-256-128(%rbx),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,32-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 160-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 352-256-128(%rbx),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,64-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 192-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 384-256-128(%rbx),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,96-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + 
vmovdqa 224-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 416-256-128(%rbx),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,128-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 256-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 448-256-128(%rbx),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,160-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 288-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 480-256-128(%rbx),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,192-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 320-256-128(%rbx),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 
0-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,224-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 352-256-128(%rbx),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 32-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,256-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 384-256-128(%rbx),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 64-128(%rax),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,288-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 416-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 96-128(%rax),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu 
%ymm13,320-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 448-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 128-128(%rax),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,352-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 64(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 480-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,384-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 160-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 0-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,416-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 192-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld 
$2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 32-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,448-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 224-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 64-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,480-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 256-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 96-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,0-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 288-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 128-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,32-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 
320-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 160-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,64-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 352-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 192-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,96-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 384-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 224-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,128-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 416-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor 
%ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 256-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,160-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 448-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 288-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,192-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 480-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 320-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,224-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 0-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 352-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 32-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor 
%ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 384-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 64-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 416-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 96-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 448-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 128-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 480-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vpaddd 
%ymm11,%ymm3,%ymm3 + vpxor 160-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 0-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 192-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 32-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 224-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + + vpsrld $27,%ymm1,%ymm8 + vpaddd %ymm14,%ymm0,%ymm0 + vpxor %ymm3,%ymm5,%ymm5 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm6,%ymm2,%ymm2 + movl $1,%ecx + leaq 512(%rsp),%rbx + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r12 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r13 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r14 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r15 + cmpl 16(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 20(%rbx),%ecx + 
cmovgeq %rbp,%r9 + cmpl 24(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 28(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqu (%rbx),%ymm5 + vpxor %ymm7,%ymm7,%ymm7 + vmovdqa %ymm5,%ymm6 + vpcmpgtd %ymm7,%ymm6,%ymm6 + vpaddd %ymm6,%ymm5,%ymm5 + + vpand %ymm6,%ymm0,%ymm0 + vpand %ymm6,%ymm1,%ymm1 + vpaddd 0(%rdi),%ymm0,%ymm0 + vpand %ymm6,%ymm2,%ymm2 + vpaddd 32(%rdi),%ymm1,%ymm1 + vpand %ymm6,%ymm3,%ymm3 + vpaddd 64(%rdi),%ymm2,%ymm2 + vpand %ymm6,%ymm4,%ymm4 + vpaddd 96(%rdi),%ymm3,%ymm3 + vpaddd 128(%rdi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm1,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,128(%rdi) + + vmovdqu %ymm5,(%rbx) + leaq 256+128(%rsp),%rbx + vmovdqu 96(%rbp),%ymm9 + decl %edx + jnz .Loop_avx2 + + + + + + + +.Ldone_avx2: + movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S index cf36e17d312..342db5203d1 100644 --- a/secure/lib/libcrypto/amd64/sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S @@ -15,6 +15,14 @@ sha1_block_data_order: jz .Lialu testl $536870912,%r10d jnz _shaext_shortcut + andl $296,%r10d + cmpl $296,%r10d + je _avx2_shortcut + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .align 16 @@ -2606,6 +2614,2827 @@ _ssse3_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.type sha1_block_data_order_avx,@function +.align 16 
+sha1_block_data_order_avx: +_avx_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + 
shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 
+ vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa 
%xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl 
%ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl 
$5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 
32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor 
%xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl 
%ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 
24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + 
addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 
+.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +.type sha1_block_data_order_avx2,@function +.align 16 +sha1_block_data_order_avx2: +_avx2_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq 
$4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp .Loop_avx2 +.align 32 +.Loop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp .Lalign32_1 +.align 32 +.Lalign32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi 
+ andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + 
vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal 
(%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + 
vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp .Lalign32_2 +.align 32 +.Lalign32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + 
vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor 
%ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + 
leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + 
addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je .Ldone_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp .Last_avx2 + +.align 32 +.Last_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + 
xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + 
rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + 
rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 
-88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp .Lalign32_3 +.align 32 +.Lalign32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx 
+ xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx 
+ vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal 
(%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl 
%ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S index 63dca42029e..1c77e3d13a8 100644 --- a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S @@ -12,6 +12,8 @@ sha256_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq 
$61,%rcx jc _shaext_shortcut + testl $268435456,%ecx + jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -3125,6 +3127,4676 @@ _shaext_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext +.type sha256_multi_block_avx,@function +.align 32 +sha256_multi_block_avx: +.cfi_startproc +_avx_shortcut: + shrq $32,%rcx + cmpl $2,%edx + jb .Lavx + testl $32,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 +.Lbody_avx: + leaq K256+128(%rip),%rbp + leaq 256(%rsp),%rbx + leaq 128(%rdi),%rdi + +.Loop_grande_avx: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone_avx + + vmovdqu 0-128(%rdi),%xmm8 + leaq 128(%rsp),%rax + vmovdqu 32-128(%rdi),%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + vmovdqu 96-128(%rdi),%xmm11 + vmovdqu 128-128(%rdi),%xmm12 + vmovdqu 160-128(%rdi),%xmm13 + vmovdqu 192-128(%rdi),%xmm14 + vmovdqu 224-128(%rdi),%xmm15 + vmovdqu .Lpbswap(%rip),%xmm6 + jmp .Loop_avx + +.align 32 +.Loop_avx: + vpxor %xmm9,%xmm10,%xmm4 + vmovd 0(%r8),%xmm5 + vmovd 0(%r9),%xmm0 + vpinsrd $1,0(%r10),%xmm5,%xmm5 + vpinsrd $1,0(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld 
$26,%xmm12,%xmm2 + vmovdqu %xmm5,0-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovd 4(%r8),%xmm5 + vmovd 4(%r9),%xmm0 + vpinsrd $1,4(%r10),%xmm5,%xmm5 + vpinsrd $1,4(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm5,16-128(%rax) + vpaddd %xmm14,%xmm5,%xmm5 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm5,%xmm10,%xmm10 + 
+ vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovd 8(%r8),%xmm5 + vmovd 8(%r9),%xmm0 + vpinsrd $1,8(%r10),%xmm5,%xmm5 + vpinsrd $1,8(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,32-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovd 12(%r8),%xmm5 + vmovd 12(%r9),%xmm0 + vpinsrd $1,12(%r10),%xmm5,%xmm5 + vpinsrd $1,12(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm5,48-128(%rax) + vpaddd %xmm12,%xmm5,%xmm5 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor 
%xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm5,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovd 16(%r8),%xmm5 + vmovd 16(%r9),%xmm0 + vpinsrd $1,16(%r10),%xmm5,%xmm5 + vpinsrd $1,16(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,64-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovd 20(%r8),%xmm5 + vmovd 20(%r9),%xmm0 + vpinsrd $1,20(%r10),%xmm5,%xmm5 + vpinsrd $1,20(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm5,80-128(%rax) + vpaddd %xmm10,%xmm5,%xmm5 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm5,%xmm5 + vpxor 
%xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm5,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovd 24(%r8),%xmm5 + vmovd 24(%r9),%xmm0 + vpinsrd $1,24(%r10),%xmm5,%xmm5 + vpinsrd $1,24(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,96-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovd 28(%r8),%xmm5 + vmovd 28(%r9),%xmm0 + vpinsrd $1,28(%r10),%xmm5,%xmm5 + 
vpinsrd $1,28(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm5,112-128(%rax) + vpaddd %xmm8,%xmm5,%xmm5 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm5,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovd 32(%r8),%xmm5 + vmovd 32(%r9),%xmm0 + vpinsrd $1,32(%r10),%xmm5,%xmm5 + vpinsrd $1,32(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,128-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + 
vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovd 36(%r8),%xmm5 + vmovd 36(%r9),%xmm0 + vpinsrd $1,36(%r10),%xmm5,%xmm5 + vpinsrd $1,36(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm5,144-128(%rax) + vpaddd %xmm14,%xmm5,%xmm5 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm5,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovd 40(%r8),%xmm5 + vmovd 40(%r9),%xmm0 + vpinsrd $1,40(%r10),%xmm5,%xmm5 + vpinsrd $1,40(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,160-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + 
vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovd 44(%r8),%xmm5 + vmovd 44(%r9),%xmm0 + vpinsrd $1,44(%r10),%xmm5,%xmm5 + vpinsrd $1,44(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm5,176-128(%rax) + vpaddd %xmm12,%xmm5,%xmm5 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm5,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovd 48(%r8),%xmm5 + vmovd 48(%r9),%xmm0 + vpinsrd $1,48(%r10),%xmm5,%xmm5 + vpinsrd $1,48(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,192-128(%rax) + vpaddd 
%xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovd 52(%r8),%xmm5 + vmovd 52(%r9),%xmm0 + vpinsrd $1,52(%r10),%xmm5,%xmm5 + vpinsrd $1,52(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm5,208-128(%rax) + vpaddd %xmm10,%xmm5,%xmm5 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm5,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor 
%xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovd 56(%r8),%xmm5 + vmovd 56(%r9),%xmm0 + vpinsrd $1,56(%r10),%xmm5,%xmm5 + vpinsrd $1,56(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,224-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovd 60(%r8),%xmm5 + leaq 64(%r8),%r8 + vmovd 60(%r9),%xmm0 + leaq 64(%r9),%r9 + vpinsrd $1,60(%r10),%xmm5,%xmm5 + leaq 64(%r10),%r10 + vpinsrd $1,60(%r11),%xmm0,%xmm0 + leaq 64(%r11),%r11 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm5,240-128(%rax) + vpaddd %xmm8,%xmm5,%xmm5 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + prefetcht0 63(%r8) + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + prefetcht0 63(%r9) + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + prefetcht0 
63(%r10) + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + prefetcht0 63(%r11) + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm5,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovdqu 0-128(%rax),%xmm5 + movl $3,%ecx + jmp .Loop_16_xx_avx +.align 32 +.Loop_16_xx_avx: + vmovdqu 16-128(%rax),%xmm6 + vpaddd 144-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 224-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,0-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + 
+ vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovdqu 32-128(%rax),%xmm5 + vpaddd 160-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 240-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm6,16-128(%rax) + vpaddd %xmm14,%xmm6,%xmm6 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm6,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovdqu 48-128(%rax),%xmm6 + vpaddd 176-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor 
%xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 0-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,32-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovdqu 64-128(%rax),%xmm5 + vpaddd 192-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 16-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor 
%xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm6,48-128(%rax) + vpaddd %xmm12,%xmm6,%xmm6 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm6,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovdqu 80-128(%rax),%xmm6 + vpaddd 208-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 32-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,64-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn 
%xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovdqu 96-128(%rax),%xmm5 + vpaddd 224-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 48-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm6,80-128(%rax) + vpaddd %xmm10,%xmm6,%xmm6 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor 
%xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm6,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovdqu 112-128(%rax),%xmm6 + vpaddd 240-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 64-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,96-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovdqu 128-128(%rax),%xmm5 + vpaddd 0-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld 
$25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 80-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm6,112-128(%rax) + vpaddd %xmm8,%xmm6,%xmm6 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovdqu 144-128(%rax),%xmm6 + vpaddd 16-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 96-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld 
$13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,128-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovdqu 160-128(%rax),%xmm5 + vpaddd 32-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 112-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm6,144-128(%rax) + vpaddd %xmm14,%xmm6,%xmm6 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + 
vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm6,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovdqu 176-128(%rax),%xmm6 + vpaddd 48-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 128-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,160-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand 
%xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovdqu 192-128(%rax),%xmm5 + vpaddd 64-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 144-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm6,176-128(%rax) + vpaddd %xmm12,%xmm6,%xmm6 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm6,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovdqu 208-128(%rax),%xmm6 + vpaddd 80-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld 
$7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 160-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,192-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovdqu 224-128(%rax),%xmm5 + vpaddd 96-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 176-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + 
vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm6,208-128(%rax) + vpaddd %xmm10,%xmm6,%xmm6 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm6,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovdqu 240-128(%rax),%xmm6 + vpaddd 112-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 192-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,224-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor 
%xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovdqu 0-128(%rax),%xmm5 + vpaddd 128-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 208-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm6,240-128(%rax) + vpaddd %xmm8,%xmm6,%xmm6 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand 
%xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + decl %ecx + jnz .Loop_16_xx_avx + + movl $1,%ecx + leaq K256+128(%rip),%rbp + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqa (%rbx),%xmm7 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa %xmm7,%xmm6 + vpcmpgtd %xmm0,%xmm6,%xmm6 + vpaddd %xmm6,%xmm7,%xmm7 + + vmovdqu 0-128(%rdi),%xmm0 + vpand %xmm6,%xmm8,%xmm8 + vmovdqu 32-128(%rdi),%xmm1 + vpand %xmm6,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm2 + vpand %xmm6,%xmm10,%xmm10 + vmovdqu 96-128(%rdi),%xmm5 + vpand %xmm6,%xmm11,%xmm11 + vpaddd %xmm0,%xmm8,%xmm8 + vmovdqu 128-128(%rdi),%xmm0 + vpand %xmm6,%xmm12,%xmm12 + vpaddd %xmm1,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm1 + vpand %xmm6,%xmm13,%xmm13 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqu 192-128(%rdi),%xmm2 + vpand %xmm6,%xmm14,%xmm14 + vpaddd %xmm5,%xmm11,%xmm11 + vmovdqu 224-128(%rdi),%xmm5 + vpand %xmm6,%xmm15,%xmm15 + vpaddd %xmm0,%xmm12,%xmm12 + vpaddd %xmm1,%xmm13,%xmm13 + vmovdqu %xmm8,0-128(%rdi) + vpaddd %xmm2,%xmm14,%xmm14 + vmovdqu %xmm9,32-128(%rdi) + vpaddd %xmm5,%xmm15,%xmm15 + vmovdqu %xmm10,64-128(%rdi) + vmovdqu %xmm11,96-128(%rdi) + vmovdqu %xmm12,128-128(%rdi) + vmovdqu %xmm13,160-128(%rdi) + vmovdqu %xmm14,192-128(%rdi) + vmovdqu %xmm15,224-128(%rdi) + + vmovdqu %xmm7,(%rbx) + vmovdqu .Lpbswap(%rip),%xmm6 + decl %edx + jnz .Loop_avx + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande_avx + +.Ldone_avx: + movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: 
+ .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_multi_block_avx,.-sha256_multi_block_avx +.type sha256_multi_block_avx2,@function +.align 32 +sha256_multi_block_avx2: +.cfi_startproc +_avx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-256,%rsp + movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 +.Lbody_avx2: + leaq K256+128(%rip),%rbp + leaq 128(%rdi),%rdi + +.Loop_grande_avx2: + movl %edx,552(%rsp) + xorl %edx,%edx + leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,16(%rbx) + cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,20(%rbx) + cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,24(%rbx) + cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,28(%rbx) + cmovleq %rbp,%r11 + vmovdqu 0-128(%rdi),%ymm8 + leaq 128(%rsp),%rax + vmovdqu 32-128(%rdi),%ymm9 + leaq 256+128(%rsp),%rbx + vmovdqu 64-128(%rdi),%ymm10 + vmovdqu 96-128(%rdi),%ymm11 + vmovdqu 128-128(%rdi),%ymm12 + 
vmovdqu 160-128(%rdi),%ymm13 + vmovdqu 192-128(%rdi),%ymm14 + vmovdqu 224-128(%rdi),%ymm15 + vmovdqu .Lpbswap(%rip),%ymm6 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vpxor %ymm9,%ymm10,%ymm4 + vmovd 0(%r12),%xmm5 + vmovd 0(%r8),%xmm0 + vmovd 0(%r13),%xmm1 + vmovd 0(%r9),%xmm2 + vpinsrd $1,0(%r14),%xmm5,%xmm5 + vpinsrd $1,0(%r10),%xmm0,%xmm0 + vpinsrd $1,0(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,0(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,0-128(%rax) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovd 4(%r12),%xmm5 + vmovd 4(%r8),%xmm0 + vmovd 4(%r13),%xmm1 + vmovd 4(%r9),%xmm2 + vpinsrd $1,4(%r14),%xmm5,%xmm5 + vpinsrd $1,4(%r10),%xmm0,%xmm0 + vpinsrd $1,4(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,4(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm5,32-128(%rax) + vpaddd %ymm14,%ymm5,%ymm5 + + vpsrld $11,%ymm11,%ymm1 
+ vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm5,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovd 8(%r12),%xmm5 + vmovd 8(%r8),%xmm0 + vmovd 8(%r13),%xmm1 + vmovd 8(%r9),%xmm2 + vpinsrd $1,8(%r14),%xmm5,%xmm5 + vpinsrd $1,8(%r10),%xmm0,%xmm0 + vpinsrd $1,8(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,8(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,64-128(%rax) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + 
vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovd 12(%r12),%xmm5 + vmovd 12(%r8),%xmm0 + vmovd 12(%r13),%xmm1 + vmovd 12(%r9),%xmm2 + vpinsrd $1,12(%r14),%xmm5,%xmm5 + vpinsrd $1,12(%r10),%xmm0,%xmm0 + vpinsrd $1,12(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,12(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm5,96-128(%rax) + vpaddd %ymm12,%ymm5,%ymm5 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm5,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovd 16(%r12),%xmm5 + vmovd 16(%r8),%xmm0 + vmovd 16(%r13),%xmm1 + vmovd 16(%r9),%xmm2 + vpinsrd $1,16(%r14),%xmm5,%xmm5 + vpinsrd $1,16(%r10),%xmm0,%xmm0 + vpinsrd $1,16(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,16(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,128-128(%rax) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld 
$11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovd 20(%r12),%xmm5 + vmovd 20(%r8),%xmm0 + vmovd 20(%r13),%xmm1 + vmovd 20(%r9),%xmm2 + vpinsrd $1,20(%r14),%xmm5,%xmm5 + vpinsrd $1,20(%r10),%xmm0,%xmm0 + vpinsrd $1,20(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,20(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm5,160-128(%rax) + vpaddd %ymm10,%ymm5,%ymm5 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor 
%ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm5,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovd 24(%r12),%xmm5 + vmovd 24(%r8),%xmm0 + vmovd 24(%r13),%xmm1 + vmovd 24(%r9),%xmm2 + vpinsrd $1,24(%r14),%xmm5,%xmm5 + vpinsrd $1,24(%r10),%xmm0,%xmm0 + vpinsrd $1,24(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,24(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,192-128(%rax) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovd 28(%r12),%xmm5 + vmovd 28(%r8),%xmm0 + vmovd 28(%r13),%xmm1 + vmovd 28(%r9),%xmm2 + vpinsrd $1,28(%r14),%xmm5,%xmm5 + vpinsrd $1,28(%r10),%xmm0,%xmm0 + vpinsrd $1,28(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,28(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm5,224-128(%rax) + vpaddd 
%ymm8,%ymm5,%ymm5 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm5,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + vmovd 32(%r12),%xmm5 + vmovd 32(%r8),%xmm0 + vmovd 32(%r13),%xmm1 + vmovd 32(%r9),%xmm2 + vpinsrd $1,32(%r14),%xmm5,%xmm5 + vpinsrd $1,32(%r10),%xmm0,%xmm0 + vpinsrd $1,32(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,32(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,256-256-128(%rbx) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + 
+ vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovd 36(%r12),%xmm5 + vmovd 36(%r8),%xmm0 + vmovd 36(%r13),%xmm1 + vmovd 36(%r9),%xmm2 + vpinsrd $1,36(%r14),%xmm5,%xmm5 + vpinsrd $1,36(%r10),%xmm0,%xmm0 + vpinsrd $1,36(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,36(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm5,288-256-128(%rbx) + vpaddd %ymm14,%ymm5,%ymm5 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm5,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovd 40(%r12),%xmm5 + vmovd 40(%r8),%xmm0 + vmovd 40(%r13),%xmm1 + vmovd 40(%r9),%xmm2 + vpinsrd $1,40(%r14),%xmm5,%xmm5 + vpinsrd $1,40(%r10),%xmm0,%xmm0 + vpinsrd $1,40(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,40(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 
+ vmovdqu %ymm5,320-256-128(%rbx) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovd 44(%r12),%xmm5 + vmovd 44(%r8),%xmm0 + vmovd 44(%r13),%xmm1 + vmovd 44(%r9),%xmm2 + vpinsrd $1,44(%r14),%xmm5,%xmm5 + vpinsrd $1,44(%r10),%xmm0,%xmm0 + vpinsrd $1,44(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,44(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm5,352-256-128(%rbx) + vpaddd %ymm12,%ymm5,%ymm5 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand 
%ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm5,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovd 48(%r12),%xmm5 + vmovd 48(%r8),%xmm0 + vmovd 48(%r13),%xmm1 + vmovd 48(%r9),%xmm2 + vpinsrd $1,48(%r14),%xmm5,%xmm5 + vpinsrd $1,48(%r10),%xmm0,%xmm0 + vpinsrd $1,48(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,48(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,384-256-128(%rbx) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovd 52(%r12),%xmm5 + vmovd 52(%r8),%xmm0 + vmovd 52(%r13),%xmm1 + vmovd 52(%r9),%xmm2 + vpinsrd $1,52(%r14),%xmm5,%xmm5 + vpinsrd $1,52(%r10),%xmm0,%xmm0 + vpinsrd $1,52(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,52(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld 
$6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm5,416-256-128(%rbx) + vpaddd %ymm10,%ymm5,%ymm5 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm5,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovd 56(%r12),%xmm5 + vmovd 56(%r8),%xmm0 + vmovd 56(%r13),%xmm1 + vmovd 56(%r9),%xmm2 + vpinsrd $1,56(%r14),%xmm5,%xmm5 + vpinsrd $1,56(%r10),%xmm0,%xmm0 + vpinsrd $1,56(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,56(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,448-256-128(%rbx) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + 
vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovd 60(%r12),%xmm5 + leaq 64(%r12),%r12 + vmovd 60(%r8),%xmm0 + leaq 64(%r8),%r8 + vmovd 60(%r13),%xmm1 + leaq 64(%r13),%r13 + vmovd 60(%r9),%xmm2 + leaq 64(%r9),%r9 + vpinsrd $1,60(%r14),%xmm5,%xmm5 + leaq 64(%r14),%r14 + vpinsrd $1,60(%r10),%xmm0,%xmm0 + leaq 64(%r10),%r10 + vpinsrd $1,60(%r15),%xmm1,%xmm1 + leaq 64(%r15),%r15 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,60(%r11),%xmm2,%xmm2 + leaq 64(%r11),%r11 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm5,480-256-128(%rbx) + vpaddd %ymm8,%ymm5,%ymm5 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r12) + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + prefetcht0 63(%r13) + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r14) + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + prefetcht0 63(%r15) + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm9,%ymm1 + prefetcht0 63(%r8) + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + prefetcht0 63(%r9) + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r10) + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm5,%ymm12,%ymm12 + prefetcht0 63(%r11) + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp 
+ vmovdqu 0-128(%rax),%ymm5 + movl $3,%ecx + jmp .Loop_16_xx_avx2 +.align 32 +.Loop_16_xx_avx2: + vmovdqu 32-128(%rax),%ymm6 + vpaddd 288-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 448-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,0-128(%rax) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovdqu 64-128(%rax),%ymm5 + vpaddd 320-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 480-256-128(%rbx),%ymm0 + vpsrld 
$10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm6,32-128(%rax) + vpaddd %ymm14,%ymm6,%ymm6 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm6,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovdqu 96-128(%rax),%ymm6 + vpaddd 352-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 0-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu 
%ymm5,64-128(%rax) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovdqu 128-128(%rax),%ymm5 + vpaddd 384-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 32-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm6,96-128(%rax) + vpaddd %ymm12,%ymm6,%ymm6 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor 
%ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm6,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovdqu 160-128(%rax),%ymm6 + vpaddd 416-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 64-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,128-128(%rax) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 
+ vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovdqu 192-128(%rax),%ymm5 + vpaddd 448-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 96-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm6,160-128(%rax) + vpaddd %ymm10,%ymm6,%ymm6 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm6,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovdqu 224-128(%rax),%ymm6 + vpaddd 480-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + 
vmovdqu 128-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,192-128(%rax) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovdqu 256-256-128(%rbx),%ymm5 + vpaddd 0-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 160-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm13,%ymm7 + vpslld 
$26,%ymm13,%ymm2 + vmovdqu %ymm6,224-128(%rax) + vpaddd %ymm8,%ymm6,%ymm6 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm6,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + vmovdqu 288-256-128(%rbx),%ymm6 + vpaddd 32-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 192-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,256-256-128(%rbx) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + 
vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovdqu 320-256-128(%rbx),%ymm5 + vpaddd 64-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 224-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm6,288-256-128(%rbx) + vpaddd %ymm14,%ymm6,%ymm6 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld 
$10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm6,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovdqu 352-256-128(%rbx),%ymm6 + vpaddd 96-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 256-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,320-256-128(%rbx) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovdqu 384-256-128(%rbx),%ymm5 + vpaddd 128-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + 
vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 288-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm6,352-256-128(%rbx) + vpaddd %ymm12,%ymm6,%ymm6 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm6,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovdqu 416-256-128(%rbx),%ymm6 + vpaddd 160-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 320-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 
+ vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,384-256-128(%rbx) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovdqu 448-256-128(%rbx),%ymm5 + vpaddd 192-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 352-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm6,416-256-128(%rbx) + vpaddd %ymm10,%ymm6,%ymm6 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand 
%ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm6,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovdqu 480-256-128(%rbx),%ymm6 + vpaddd 224-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 384-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,448-256-128(%rbx) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld 
$22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovdqu 0-128(%rax),%ymm5 + vpaddd 256-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 416-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm6,480-256-128(%rbx) + vpaddd %ymm8,%ymm6,%ymm6 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm6,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + decl %ecx + jnz .Loop_16_xx_avx2 + + movl $1,%ecx + leaq 512(%rsp),%rbx + leaq K256+128(%rip),%rbp + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r12 + cmpl 
4(%rbx),%ecx + cmovgeq %rbp,%r13 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r14 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r15 + cmpl 16(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 20(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 24(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 28(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqa (%rbx),%ymm7 + vpxor %ymm0,%ymm0,%ymm0 + vmovdqa %ymm7,%ymm6 + vpcmpgtd %ymm0,%ymm6,%ymm6 + vpaddd %ymm6,%ymm7,%ymm7 + + vmovdqu 0-128(%rdi),%ymm0 + vpand %ymm6,%ymm8,%ymm8 + vmovdqu 32-128(%rdi),%ymm1 + vpand %ymm6,%ymm9,%ymm9 + vmovdqu 64-128(%rdi),%ymm2 + vpand %ymm6,%ymm10,%ymm10 + vmovdqu 96-128(%rdi),%ymm5 + vpand %ymm6,%ymm11,%ymm11 + vpaddd %ymm0,%ymm8,%ymm8 + vmovdqu 128-128(%rdi),%ymm0 + vpand %ymm6,%ymm12,%ymm12 + vpaddd %ymm1,%ymm9,%ymm9 + vmovdqu 160-128(%rdi),%ymm1 + vpand %ymm6,%ymm13,%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vmovdqu 192-128(%rdi),%ymm2 + vpand %ymm6,%ymm14,%ymm14 + vpaddd %ymm5,%ymm11,%ymm11 + vmovdqu 224-128(%rdi),%ymm5 + vpand %ymm6,%ymm15,%ymm15 + vpaddd %ymm0,%ymm12,%ymm12 + vpaddd %ymm1,%ymm13,%ymm13 + vmovdqu %ymm8,0-128(%rdi) + vpaddd %ymm2,%ymm14,%ymm14 + vmovdqu %ymm9,32-128(%rdi) + vpaddd %ymm5,%ymm15,%ymm15 + vmovdqu %ymm10,64-128(%rdi) + vmovdqu %ymm11,96-128(%rdi) + vmovdqu %ymm12,128-128(%rdi) + vmovdqu %ymm13,160-128(%rdi) + vmovdqu %ymm14,192-128(%rdi) + vmovdqu %ymm15,224-128(%rdi) + + vmovdqu %ymm7,(%rbx) + leaq 256+128(%rsp),%rbx + vmovdqu .Lpbswap(%rip),%ymm6 + decl %edx + jnz .Loop_avx2 + + + + + + + +.Ldone_avx2: + movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 .align 256 K256: .long 1116352408,1116352408,1116352408,1116352408 diff 
--git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S index 91b3ead8976..13d497c4f8e 100644 --- a/secure/lib/libcrypto/amd64/sha256-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S @@ -14,6 +14,14 @@ sha256_block_data_order: movl 8(%r11),%r11d testl $536870912,%r11d jnz _shaext_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax @@ -3087,3 +3095,2364 @@ sha256_block_data_order_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + 
vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + 
addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl 
$5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + 
vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl 
$2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d 
+ xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl 
%r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + 
xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + 
andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + 
shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + 
movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +.type sha256_block_data_order_avx2,@function +.align 64 +sha256_block_data_order_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb 
%ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 88(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -64(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl 
$13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal 
(%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 
+ rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl 
%eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl 
$6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d 
+ rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl 
%r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + 
andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl 
$25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal 
(%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d 
+ rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl 
%r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + +.cfi_escape 
0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 88(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 diff --git a/secure/lib/libcrypto/amd64/sha512-x86_64.S b/secure/lib/libcrypto/amd64/sha512-x86_64.S index a9b971a1b7c..ae11a36e482 100644 --- a/secure/lib/libcrypto/amd64/sha512-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha512-x86_64.S @@ -8,6 +8,20 @@ .align 16 sha512_block_data_order: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz .Lxop_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -1801,3 +1815,3649 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 
83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha512_block_data_order_xop,@function +.align 64 +sha512_block_data_order_xop: +.cfi_startproc +.Lxop_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + 
vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 
143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 
+ andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq 
$14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq 
%r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + 
xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + 
movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq 
%r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq 
%rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 
16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_xop,.-sha512_block_data_order_xop +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb 
%xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor 
%xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq 
$23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + 
xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq 
%rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 
88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx 
+ movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq 
%r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq 
%rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + 
shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 
104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp 
+.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +.type sha512_block_data_order_avx2,@function +.align 64 +sha512_block_data_order_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 
+ vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + + movq 152(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -128(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq 
%r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 
+ leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr 
$8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq 
$1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq 
$7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq 
$3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + 
xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq 
%r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq 
%rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + 
xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq 
(%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq 
%r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + 
xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + leaq 256(%rsi),%rsi + addq 48(%rdi),%r10 + movq %rsi,%r12 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + cmoveq %rsp,%r12 + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08 + +.Ldone_avx2: + movq 152(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore 
%r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 diff --git a/secure/lib/libcrypto/amd64/x25519-x86_64.S b/secure/lib/libcrypto/amd64/x25519-x86_64.S index 28063bf95b0..7448e866aaf 100644 --- a/secure/lib/libcrypto/amd64/x25519-x86_64.S +++ b/secure/lib/libcrypto/amd64/x25519-x86_64.S @@ -397,32 +397,408 @@ x25519_fe51_mul121666: .Lfe51_mul121666_epilogue: .cfi_endproc .size x25519_fe51_mul121666,.-x25519_fe51_mul121666 + .globl x25519_fe64_eligible .type x25519_fe64_eligible,@function .align 32 x25519_fe64_eligible: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%ecx xorl %eax,%eax + andl $0x80100,%ecx + cmpl $0x80100,%ecx + cmovel %ecx,%eax .byte 0xf3,0xc3 .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,@function -.globl x25519_fe64_sqr -.globl x25519_fe64_mul121666 -.globl x25519_fe64_add -.globl x25519_fe64_sub -.globl x25519_fe64_tobytes +.align 32 x25519_fe64_mul: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_mul_body: + + movq %rdx,%rax + movq 0(%rdx),%rbp + movq 0(%rsi),%rdx + movq 8(%rax),%rcx + movq 16(%rax),%r14 + movq 24(%rax),%r15 + + mulxq %rbp,%r8,%rax + xorl %edi,%edi + mulxq %rcx,%r9,%rbx + adcxq %rax,%r9 + mulxq %r14,%r10,%rax + adcxq %rbx,%r10 + mulxq %r15,%r11,%r12 + movq 8(%rsi),%rdx + adcxq %rax,%r11 + movq %r14,(%rsp) + adcxq %rdi,%r12 + + 
mulxq %rbp,%rax,%rbx + adoxq %rax,%r9 + adcxq %rbx,%r10 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r10 + adcxq %rbx,%r11 + mulxq %r14,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %r15,%rax,%r13 + movq 16(%rsi),%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + adoxq %rdi,%r13 + + mulxq %rbp,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %rcx,%rax,%rbx + adcxq %rax,%r11 + adoxq %rbx,%r12 + mulxq %r14,%rax,%rbx + adcxq %rax,%r12 + adoxq %rbx,%r13 + mulxq %r15,%rax,%r14 + movq 24(%rsi),%rdx + adcxq %rax,%r13 + adoxq %rdi,%r14 + adcxq %rdi,%r14 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + mulxq (%rsp),%rax,%rbx + adoxq %rax,%r13 + adcxq %rbx,%r14 + mulxq %r15,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + + jmp .Lreduce64 +.Lfe64_mul_epilogue: +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,@function +.align 32 x25519_fe64_sqr: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_sqr_body: + + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%rbp + movq 24(%rsi),%rsi + + + mulxq %rdx,%r8,%r15 + mulxq %rcx,%r9,%rax + xorl %edi,%edi + mulxq %rbp,%r10,%rbx + adcxq %rax,%r10 + mulxq %rsi,%r11,%r12 + movq %rcx,%rdx + adcxq %rbx,%r11 + adcxq %rdi,%r12 + + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rsi,%rax,%r13 + movq %rbp,%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + + + mulxq %rsi,%rax,%r14 + movq %rcx,%rdx + adoxq %rax,%r13 + adcxq 
%rdi,%r14 + adoxq %rdi,%r14 + + adcxq %r9,%r9 + adoxq %r15,%r9 + adcxq %r10,%r10 + mulxq %rdx,%rax,%rbx + movq %rbp,%rdx + adcxq %r11,%r11 + adoxq %rax,%r10 + adcxq %r12,%r12 + adoxq %rbx,%r11 + mulxq %rdx,%rax,%rbx + movq %rsi,%rdx + adcxq %r13,%r13 + adoxq %rax,%r12 + adcxq %r14,%r14 + adoxq %rbx,%r13 + mulxq %rdx,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulxq %r12,%rax,%rbx + adcxq %rax,%r8 + adoxq %rbx,%r9 + mulxq %r13,%rax,%rbx + adcxq %rax,%r9 + adoxq %rbx,%r10 + mulxq %r14,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %r15,%rax,%r12 + adcxq %rax,%r11 + adoxq %rdi,%r12 + adcxq %rdi,%r12 + + movq 16(%rsp),%rdi + imulq %rdx,%r12 + + addq %r12,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe64_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,@function +.align 32 x25519_fe64_mul121666: +.Lfe64_mul121666_body: +.cfi_startproc + movl $121666,%edx + mulxq 0(%rsi),%r8,%rcx + mulxq 8(%rsi),%r9,%rax + addq %rcx,%r9 + mulxq 16(%rsi),%r10,%rcx + adcq %rax,%r10 + mulxq 24(%rsi),%r11,%rax + adcq %rcx,%r11 + adcq $0,%rax + + imulq $38,%rax,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + +.Lfe64_mul121666_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl 
x25519_fe64_add +.type x25519_fe64_add,@function +.align 32 x25519_fe64_add: +.Lfe64_add_body: +.cfi_startproc + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + movq %r9,8(%rdi) + adcq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + addq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_add_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,@function +.align 32 x25519_fe64_sub: +.Lfe64_sub_body: +.cfi_startproc + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + movq %r9,8(%rdi) + sbbq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + subq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_sub_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,@function +.align 32 x25519_fe64_tobytes: +.Lfe64_to_body: .cfi_startproc -.byte 0x0f,0x0b + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + + leaq (%r11,%r11,1),%rax + sarq $63,%r11 + shrq $1,%rax + andq $19,%r11 + addq $19,%r11 + + addq %r11,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rax + + leaq (%rax,%rax,1),%r11 + sarq $63,%rax + shrq $1,%r11 + notq %rax + andq $19,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + sbbq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + +.Lfe64_to_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size x25519_fe64_mul,.-x25519_fe64_mul +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes .byte 
88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S index 2fd4d2f4600..015a87c446b 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont.S @@ -16,6 +16,7 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter + movl OPENSSL_ia32cap_P+8(%rip),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -264,6 +265,9 @@ bn_mul4x_mont: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -689,6 +693,7 @@ bn_mul4x_mont: .size bn_mul4x_mont,.-bn_mul4x_mont + .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: @@ -770,6 +775,25 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + movl OPENSSL_ia32cap_P+8(%rip),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: call bn_sqr8x_internal @@ -857,5 +881,361 @@ bn_sqr8x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq 
(%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq 
%rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + 
adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S index b69366fa905..cb2528c08dd 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont5.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S @@ -14,6 +14,7 @@ bn_mul_mont_gather5: .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter + movl OPENSSL_ia32cap_P+8(%rip),%r11d jmp .Lmul4x_enter .align 16 @@ -450,6 +451,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1079,6 +1083,10 @@ bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax + movl OPENSSL_ia32cap_P+8(%rip),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2168,6 +2176,21 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + movl OPENSSL_ia32cap_P+8(%rip),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne .Lfrom_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2206,6 +2229,1348 @@ bn_from_mont8x: .byte 0xf3,0xc3 .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset 
%r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 
+ pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 
32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 
+.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq 
%r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type 
bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq 
%r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 
24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax 
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq 
%r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + 
adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_get_bits5 .type bn_get_bits5,@function .align 16 diff --git a/secure/lib/libcrypto/i386/chacha-x86.S b/secure/lib/libcrypto/i386/chacha-x86.S index 566285310e0..d6b2936a538 100644 --- a/secure/lib/libcrypto/i386/chacha-x86.S +++ b/secure/lib/libcrypto/i386/chacha-x86.S @@ -385,6 
+385,8 @@ ChaCha20_ssse3: pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx @@ -528,6 +530,484 @@ ChaCha20_ssse3: .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa 
-96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor 
%xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor 
%xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + 
vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + 
movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done 
+.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #else .text @@ -914,6 +1394,8 @@ ChaCha20_ssse3: pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx @@ -1057,5 +1539,483 @@ ChaCha20_ssse3: .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd 
$0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa -96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor 
%xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor 
%xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor 
-64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq 
%xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + 
vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done +.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #endif diff --git a/secure/lib/libcrypto/i386/poly1305-x86.S b/secure/lib/libcrypto/i386/poly1305-x86.S index b394500278d..100deee40bf 100644 --- a/secure/lib/libcrypto/i386/poly1305-x86.S +++ b/secure/lib/libcrypto/i386/poly1305-x86.S @@ -36,6 +36,10 @@ poly1305_init: jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) @@ -1344,6 +1348,557 @@ _poly1305_emit_sse2: popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + 
vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + 
vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd 
%xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type _poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd 
$200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq 
-128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq 
$6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + 
vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) + vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 @@ -1392,6 +1947,10 @@ poly1305_init: jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + 
jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) @@ -2700,6 +3259,557 @@ _poly1305_emit_sse2: popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + 
vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 
48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type _poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd 
$200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 
64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq 
$2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + 
vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) 
+ vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 diff --git a/secure/lib/libcrypto/i386/sha1-586.S b/secure/lib/libcrypto/i386/sha1-586.S index 49e7482b816..7e90e2d9b1d 100644 --- a/secure/lib/libcrypto/i386/sha1-586.S +++ b/secure/lib/libcrypto/i386/sha1-586.S @@ -25,6 +25,11 @@ sha1_block_data_order: jz .L001x86 testl $536870912,%ecx jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: @@ -2782,1534 +2787,2709 @@ _sha1_block_data_order_ssse3: popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 -.align 64 -.LK_XX_XX: -.long 1518500249,1518500249,1518500249,1518500249 -.long 1859775393,1859775393,1859775393,1859775393 -.long 2400959708,2400959708,2400959708,2400959708 -.long 3395469782,3395469782,3395469782,3395469782 -.long 66051,67438087,134810123,202182159 -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 -.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 -.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.comm OPENSSL_ia32cap_P,16,4 -#else -.text -.globl sha1_block_data_order -.type sha1_block_data_order,@function +.type _sha1_block_data_order_avx,@function .align 16 -sha1_block_data_order: -.L_sha1_block_data_order_begin: +_sha1_block_data_order_avx: pushl %ebp pushl %ebx pushl %esi pushl %edi - call .L000pic_point -.L000pic_point: + call .L008pic_point +.L008pic_point: popl %ebp - leal OPENSSL_ia32cap_P,%esi - leal .LK_XX_XX-.L000pic_point(%ebp),%ebp - movl (%esi),%eax - movl 4(%esi),%edx - testl $512,%edx - jz .L001x86 - movl 
8(%esi),%ecx - testl $16777216,%eax - jz .L001x86 - testl $536870912,%ecx - jnz .Lshaext_shortcut - jmp .Lssse3_shortcut -.align 16 -.L001x86: - movl 20(%esp),%ebp - movl 24(%esp),%esi - movl 28(%esp),%eax - subl $76,%esp - shll $6,%eax - addl %esi,%eax - movl %eax,104(%esp) - movl 16(%ebp),%edi - jmp .L002loop + leal .LK_XX_XX-.L008pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L009loop .align 16 -.L002loop: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,(%esp) - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl %edx,12(%esp) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,16(%esp) - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edx,28(%esp) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 
44(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,40(%esp) - movl %edx,44(%esp) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,48(%esp) - movl %ebx,52(%esp) - movl %ecx,56(%esp) - movl %edx,60(%esp) - movl %esi,100(%esp) - movl (%ebp),%eax - movl 4(%ebp),%ebx - movl 8(%ebp),%ecx - movl 12(%ebp),%edx - - movl %ecx,%esi - movl %eax,%ebp - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl (%esp),%edi - andl %ebx,%esi - rorl $2,%ebx +.L009loop: + shrdl $2,%ebx,%ebx xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 4(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 8(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 12(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 16(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 20(%esp),%esi - andl %ecx,%eax - rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl 
%ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 24(%esp),%edi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 28(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 32(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl 
%edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 36(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 40(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 44(%esp),%esi - andl %ecx,%eax - rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - - movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 48(%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 52(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 56(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 60(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - movl (%esp),%ebx - addl %ebp,%ecx - + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 movl %edi,%ebp - xorl 8(%esp),%ebx - xorl %esi,%ebp - xorl 32(%esp),%ebx - andl %edx,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - xorl %esi,%ebp - addl %ebp,%eax + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + 
shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 movl %ecx,%ebp - rorl $2,%edx - movl %ebx,(%esp) - roll $5,%ebp - leal 1518500249(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 12(%esp),%eax - xorl %edi,%ebp - xorl 36(%esp),%eax - andl %ecx,%ebp - xorl 56(%esp),%eax - roll $1,%eax + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx xorl %edi,%ebp - addl %ebp,%esi - movl %ebx,%ebp - rorl $2,%ecx - movl %eax,4(%esp) - roll $5,%ebp - leal 1518500249(%eax,%esi,1),%eax - movl 8(%esp),%esi + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx addl %ebp,%eax - - movl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 40(%esp),%esi - andl %ebx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - xorl %edx,%ebp - addl %ebp,%edi + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 movl %eax,%ebp - rorl $2,%ebx - movl %esi,8(%esp) - roll $5,%ebp - leal 1518500249(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 20(%esp),%edi - xorl %ecx,%ebp - xorl 44(%esp),%edi - andl %eax,%ebp - xorl (%esp),%edi - roll $1,%edi + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + 
shrdl $7,%eax,%eax xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 addl %ebp,%edx - movl %esi,%ebp - rorl $2,%eax - movl %edi,12(%esp) - roll $5,%ebp - leal 1518500249(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 24(%esp),%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx xorl %eax,%ebp - xorl 48(%esp),%edx + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + 
addl %edi,%edx + shrdl $7,%edi,%edi xorl %ebx,%ebp - xorl 4(%esp),%edx - roll $1,%edx + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 20(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 32(%esp),%ebx - xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 28(%esp),%eax - addl %ebp,%ebx - + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax xorl %edi,%ebp - xorl 16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 32(%esp),%esi + movl %ebx,%esi + shldl $5,%ebx,%ebx addl %ebp,%eax - - movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi - xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 
16(%esp),%edi + xorl %ecx,%esi movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 40(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 44(%esp),%ecx + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) addl %ebp,%edx - - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx - xorl %eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx - addl %ebp,%ecx - + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi movl %edx,%ebp - xorl 56(%esp),%ebx + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 52(%esp),%eax + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx addl %ebp,%ebx - - movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax - xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - 
movl %eax,52(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 56(%esp),%esi - addl %ebp,%eax - + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi movl %ebx,%ebp - xorl (%esp),%esi - xorl %ecx,%ebp - xorl 24(%esp),%esi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax xorl %edx,%ebp - xorl 44(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,56(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 4(%esp),%edi - xorl %ebx,%ebp - xorl 28(%esp),%edi + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,60(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl (%esp),%edx + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) addl %ebp,%edi - - movl %esi,%ebp - xorl 8(%esp),%edx - xorl %eax,%ebp - xorl 32(%esp),%edx - xorl %ebx,%ebp - xorl 52(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 4(%esp),%ecx - addl %ebp,%edx - + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi movl %edi,%ebp - xorl 12(%esp),%ecx - xorl %esi,%ebp - xorl 36(%esp),%ecx + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx xorl %eax,%ebp - xorl 56(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,4(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 8(%esp),%ebx + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx 
addl %ebp,%ecx - - movl %edx,%ebp - xorl 16(%esp),%ebx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx xorl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl 60(%esp),%ebx - roll $1,%ebx + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,8(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 12(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 20(%esp),%eax - xorl %edx,%ebp - xorl 44(%esp),%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx xorl %edi,%ebp - xorl (%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,12(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 16(%esp),%esi - addl %ebp,%eax - + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl 
$7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi movl %ebx,%ebp - xorl 24(%esp),%esi - xorl %ecx,%ebp - xorl 48(%esp),%esi + shldl $5,%ebx,%ebx + addl %esi,%eax xorl %edx,%ebp - xorl 4(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,16(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 20(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 28(%esp),%edi - xorl %ebx,%ebp - xorl 52(%esp),%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi xorl %ecx,%ebp - xorl 8(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,20(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 24(%esp),%edx + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax addl %ebp,%edi - - movl %esi,%ebp - xorl 32(%esp),%edx - xorl %eax,%ebp - xorl 56(%esp),%edx - xorl %ebx,%ebp - xorl 12(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,24(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 28(%esp),%ecx - addl %ebp,%edx - + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi movl %edi,%ebp - xorl 36(%esp),%ecx - xorl %esi,%ebp - xorl 60(%esp),%ecx + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx xorl %eax,%ebp - xorl 16(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,28(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 32(%esp),%ebx + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) addl %ebp,%ecx - - movl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl (%esp),%ebx - andl %edx,%ebp - xorl 
20(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,32(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 36(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 44(%esp),%eax + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx xorl %edi,%ebp - xorl 4(%esp),%eax - andl %ecx,%ebp - xorl 24(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx movl %ebx,%esi - roll $5,%esi - movl %eax,36(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 40(%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 48(%esp),%esi xorl %edx,%ebp - xorl 8(%esp),%esi - andl %ebx,%ebp - xorl 28(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,40(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 44(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 52(%esp),%edi - xorl %ecx,%ebp - xorl 12(%esp),%edi - andl %eax,%ebp - xorl 32(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,44(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 48(%esp),%edx - addl %ebp,%edi - + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) movl %eax,%ebp - xorl 56(%esp),%edx + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 
xorl %ebx,%ebp - xorl 16(%esp),%edx - andl %esi,%ebp - xorl 36(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,48(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) andl %ebx,%ebp - movl 52(%esp),%ecx + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi addl %ebp,%edx - - movl %esi,%ebp - xorl 60(%esp),%ecx - xorl %eax,%ebp - xorl 20(%esp),%ecx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx andl %edi,%ebp - xorl 40(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,52(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 56(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl (%esp),%ebx - xorl %esi,%ebp - xorl 24(%esp),%ebx - andl %edx,%ebp - xorl 44(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,56(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 60(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 4(%esp),%eax + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi xorl %edi,%ebp - xorl 28(%esp),%eax - andl %ecx,%ebp - xorl 48(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,60(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl (%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 8(%esp),%esi - xorl %edx,%ebp - xorl 32(%esp),%esi - andl 
%ebx,%ebp - xorl 52(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 4(%esp),%edi - addl %ebp,%esi - + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) movl %ebx,%ebp - xorl 12(%esp),%edi + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 xorl %ecx,%ebp - xorl 36(%esp),%edi - andl %eax,%ebp - xorl 56(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,4(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) andl %ecx,%ebp - movl 8(%esp),%edx + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax addl %ebp,%edi - - movl %eax,%ebp - xorl 16(%esp),%edx - xorl %ebx,%ebp - xorl 40(%esp),%edx - andl %esi,%ebp - xorl 60(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,8(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 12(%esp),%ecx - addl %ebp,%edx - - movl %esi,%ebp - xorl 20(%esp),%ecx + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx xorl %eax,%ebp - xorl 44(%esp),%ecx - andl %edi,%ebp - xorl (%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - 
movl %ecx,12(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx andl %eax,%ebp - movl 16(%esp),%ebx + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx addl %ebp,%ecx - - movl %edi,%ebp - xorl 24(%esp),%ebx - xorl %esi,%ebp - xorl 48(%esp),%ebx - andl %edx,%ebp - xorl 4(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,16(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 20(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 28(%esp),%eax - xorl %edi,%ebp - xorl 52(%esp),%eax - andl %ecx,%ebp - xorl 8(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,20(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 24(%esp),%esi - addl %ebp,%eax - + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) movl %ecx,%ebp - xorl 32(%esp),%esi + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 xorl %edx,%ebp - xorl 56(%esp),%esi - andl %ebx,%ebp - xorl 12(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,24(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) andl %edx,%ebp - movl 28(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 36(%esp),%edi - xorl %ecx,%ebp - xorl 60(%esp),%edi - andl %eax,%ebp - xorl 16(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,28(%esp) - leal 
2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 32(%esp),%edx - addl %ebp,%edi - + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 movl %eax,%ebp - xorl 40(%esp),%edx + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi xorl %ebx,%ebp - xorl (%esp),%edx - andl %esi,%ebp - xorl 20(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,32(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx andl %ebx,%ebp - movl 36(%esp),%ecx - addl %ebp,%edx - - movl %esi,%ebp - xorl 44(%esp),%ecx - xorl %eax,%ebp - xorl 4(%esp),%ecx - andl %edi,%ebp - xorl 24(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,36(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 40(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl 48(%esp),%ebx - xorl %esi,%ebp - xorl 8(%esp),%ebx - andl %edx,%ebp - xorl 28(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,40(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 44(%esp),%eax - addl %ebp,%ebx - + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) movl %edx,%ebp - xorl 52(%esp),%eax + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd 
%xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 xorl %edi,%ebp - xorl 12(%esp),%eax - andl %ecx,%ebp - xorl 32(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,44(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) andl %edi,%ebp - movl 48(%esp),%esi - addl %ebp,%eax - + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 movl %ebx,%ebp - xorl 56(%esp),%esi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax xorl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 36(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,48(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 52(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 60(%esp),%edi - xorl %ebx,%ebp - xorl 20(%esp),%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi xorl %ecx,%ebp - xorl 40(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,52(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 56(%esp),%edx + shldl $5,%eax,%eax addl %ebp,%edi - - movl %esi,%ebp - xorl (%esp),%edx - xorl %eax,%ebp - xorl 24(%esp),%edx - xorl %ebx,%ebp - xorl 44(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,56(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 60(%esp),%ecx - addl %ebp,%edx - + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl 
%ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) movl %edi,%ebp - xorl 4(%esp),%ecx - xorl %esi,%ebp - xorl 28(%esp),%ecx + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 xorl %eax,%ebp - xorl 48(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,60(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl (%esp),%ebx + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx addl %ebp,%ecx - - movl %edx,%ebp - xorl 8(%esp),%ebx - xorl %edi,%ebp - xorl 32(%esp),%ebx - xorl %esi,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 movl %ecx,%ebp - xorl 12(%esp),%eax + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx xorl %edx,%ebp - xorl 36(%esp),%eax - xorl %edi,%ebp - xorl 56(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,4(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 8(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 16(%esp),%esi - xorl %ecx,%ebp - xorl 40(%esp),%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi xorl %edx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,8(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - + shldl $5,%ebx,%ebx + addl 
%ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi movl %eax,%ebp - xorl 20(%esp),%edi - xorl %ebx,%ebp - xorl 44(%esp),%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi xorl %ecx,%ebp - xorl (%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,12(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 24(%esp),%edx - xorl %eax,%ebp - xorl 48(%esp),%edx + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx xorl %ebx,%ebp - xorl 4(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 20(%esp),%ecx + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) addl %ebp,%edx - - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi movl %edx,%ebp - xorl 32(%esp),%ebx + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 28(%esp),%eax + movl %ecx,%esi + shldl $5,%ecx,%ecx addl %ebp,%ebx - - movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax - xorl %edi,%ebp - xorl 
16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 32(%esp),%esi - addl %ebp,%eax - + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 40(%esp),%edx + movl %eax,%esi + shldl $5,%eax,%eax addl %ebp,%edi - - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 44(%esp),%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L010done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl 
%ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi addl %ebp,%edx - - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx xorl %eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx addl %ebp,%ecx - - movl %edx,%ebp - xorl 56(%esp),%ebx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl 
%esi,%ebx xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 52(%esp),%eax - addl %ebp,%ebx - + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L009loop +.align 16 +.L010done: + addl 16(%esp),%ebx + xorl %edi,%esi movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax + shldl $5,%ecx,%ecx + addl %esi,%ebx xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - leal 3395469782(%eax,%esi,1),%eax - movl 56(%esp),%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx addl %ebp,%eax - - movl %ebx,%ebp - xorl (%esp),%esi + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi xorl %ecx,%ebp - xorl 24(%esp),%esi - xorl %edx,%ebp - xorl 44(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - leal 3395469782(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 4(%esp),%edi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx xorl %ebx,%ebp - xorl 28(%esp),%edi - xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi + movl %edi,%esi + shldl $5,%edi,%edi addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - leal 3395469782(%edi,%edx,1),%edi + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx 
+ addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax addl %ebp,%edi - movl 96(%esp),%ebp - movl 100(%esp),%edx - addl (%ebp),%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp addl 4(%ebp),%esi - addl 8(%ebp),%eax - addl 12(%ebp),%ebx - addl 16(%ebp),%ecx - movl %edi,(%ebp) - addl $64,%edx + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx movl %esi,4(%ebp) - cmpl 104(%esp),%edx - movl %eax,8(%ebp) - movl %ecx,%edi - movl %ebx,12(%ebp) - movl %edx,%esi - movl %ecx,16(%ebp) - jb .L002loop - addl $76,%esp + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) popl %edi popl %esi popl %ebx popl %ebp ret -.size sha1_block_data_order,.-.L_sha1_block_data_order_begin -.type _sha1_block_data_order_shaext,@function +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx +.align 64 +.LK_XX_XX: +.long 
1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 +#else +.text +.globl sha1_block_data_order +.type sha1_block_data_order,@function .align 16 -_sha1_block_data_order_shaext: +sha1_block_data_order: +.L_sha1_block_data_order_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi - call .L003pic_point -.L003pic_point: + call .L000pic_point +.L000pic_point: popl %ebp - leal .LK_XX_XX-.L003pic_point(%ebp),%ebp -.Lshaext_shortcut: - movl 20(%esp),%edi - movl %esp,%ebx + leal OPENSSL_ia32cap_P,%esi + leal .LK_XX_XX-.L000pic_point(%ebp),%ebp + movl (%esi),%eax + movl 4(%esi),%edx + testl $512,%edx + jz .L001x86 + movl 8(%esi),%ecx + testl $16777216,%eax + jz .L001x86 + testl $536870912,%ecx + jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut + jmp .Lssse3_shortcut +.align 16 +.L001x86: + movl 20(%esp),%ebp movl 24(%esp),%esi - movl 28(%esp),%ecx - subl $32,%esp - movdqu (%edi),%xmm0 - movd 16(%edi),%xmm1 - andl $-32,%esp - movdqa 80(%ebp),%xmm3 - movdqu (%esi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%esi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 -.byte 102,15,56,0,251 - jmp .L004loop_shaext + movl 28(%esp),%eax + subl $76,%esp + shll $6,%eax + addl %esi,%eax + movl %eax,104(%esp) + movl 16(%ebp),%edi + jmp .L002loop .align 16 -.L004loop_shaext: - decl %ecx - leal 64(%esi),%eax - movdqa 
%xmm1,(%esp) - paddd %xmm4,%xmm1 - cmovnel %eax,%esi - movdqa %xmm0,16(%esp) -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 +.L002loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,(%esp) + movl %ebx,4(%esp) + movl 
%ecx,8(%esp) + movl %edx,12(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,16(%esp) + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %edx,28(%esp) + movl 32(%esi),%eax + movl 36(%esi),%ebx + movl 40(%esi),%ecx + movl 44(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edx,44(%esp) + movl 48(%esi),%eax + movl 52(%esi),%ebx + movl 56(%esi),%ecx + movl 60(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,48(%esp) + movl %ebx,52(%esp) + movl %ecx,56(%esp) + movl %edx,60(%esp) + movl %esi,100(%esp) + movl (%ebp),%eax + movl 4(%ebp),%ebx + movl 8(%ebp),%ecx + movl 12(%ebp),%edx + + movl %ecx,%esi + movl %eax,%ebp + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl (%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 4(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 8(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 12(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 16(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 20(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 
1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 24(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 28(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 32(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 36(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 40(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 44(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 48(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 52(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 56(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl 
%ebx,%ebp + movl 60(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + movl (%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 8(%esp),%ebx + xorl %esi,%ebp + xorl 32(%esp),%ebx + andl %edx,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + xorl %esi,%ebp + addl %ebp,%eax + movl %ecx,%ebp + rorl $2,%edx + movl %ebx,(%esp) + roll $5,%ebp + leal 1518500249(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 12(%esp),%eax + xorl %edi,%ebp + xorl 36(%esp),%eax + andl %ecx,%ebp + xorl 56(%esp),%eax + roll $1,%eax + xorl %edi,%ebp + addl %ebp,%esi + movl %ebx,%ebp + rorl $2,%ecx + movl %eax,4(%esp) + roll $5,%ebp + leal 1518500249(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 40(%esp),%esi + andl %ebx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + xorl %edx,%ebp + addl %ebp,%edi + movl %eax,%ebp + rorl $2,%ebx + movl %esi,8(%esp) + roll $5,%ebp + leal 1518500249(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 44(%esp),%edi + andl %eax,%ebp + xorl (%esp),%edi + roll $1,%edi + xorl %ecx,%ebp + addl %ebp,%edx + movl %esi,%ebp + rorl $2,%eax + movl %edi,12(%esp) + roll $5,%ebp + leal 1518500249(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl 
%edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + 
xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,52(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,56(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,60(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl (%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 8(%esp),%edx + xorl %eax,%ebp + xorl 32(%esp),%edx + xorl %ebx,%ebp + xorl 52(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 4(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 12(%esp),%ecx + xorl %esi,%ebp + xorl 36(%esp),%ecx + xorl %eax,%ebp + xorl 56(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,4(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 8(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 16(%esp),%ebx + xorl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl 60(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,8(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 12(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 20(%esp),%eax + xorl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl (%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,12(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 16(%esp),%esi + addl %ebp,%eax + + 
movl %ebx,%ebp + xorl 24(%esp),%esi + xorl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 4(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,16(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 20(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 28(%esp),%edi + xorl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 8(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,20(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 24(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 32(%esp),%edx + xorl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 12(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,24(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 28(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 36(%esp),%ecx + xorl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 16(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,28(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 32(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl (%esp),%ebx + andl %edx,%ebp + xorl 20(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,32(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 36(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl 4(%esp),%eax + andl %ecx,%ebp + xorl 24(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,36(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 40(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 8(%esp),%esi + andl %ebx,%ebp + xorl 28(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + 
movl %eax,%edi + roll $5,%edi + movl %esi,40(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 44(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 12(%esp),%edi + andl %eax,%ebp + xorl 32(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,44(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 48(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 16(%esp),%edx + andl %esi,%ebp + xorl 36(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,48(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 52(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 20(%esp),%ecx + andl %edi,%ebp + xorl 40(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,52(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 56(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl (%esp),%ebx + xorl %esi,%ebp + xorl 24(%esp),%ebx + andl %edx,%ebp + xorl 44(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,56(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 60(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 4(%esp),%eax + xorl %edi,%ebp + xorl 28(%esp),%eax + andl %ecx,%ebp + xorl 48(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,60(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl (%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 8(%esp),%esi + xorl %edx,%ebp + xorl 32(%esp),%esi + andl %ebx,%ebp + xorl 52(%esp),%esi + roll 
$1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 4(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 12(%esp),%edi + xorl %ecx,%ebp + xorl 36(%esp),%edi + andl %eax,%ebp + xorl 56(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,4(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 8(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 16(%esp),%edx + xorl %ebx,%ebp + xorl 40(%esp),%edx + andl %esi,%ebp + xorl 60(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,8(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 12(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 20(%esp),%ecx + xorl %eax,%ebp + xorl 44(%esp),%ecx + andl %edi,%ebp + xorl (%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,12(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 16(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 24(%esp),%ebx + xorl %esi,%ebp + xorl 48(%esp),%ebx + andl %edx,%ebp + xorl 4(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,16(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 20(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 28(%esp),%eax + xorl %edi,%ebp + xorl 52(%esp),%eax + andl %ecx,%ebp + xorl 8(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,20(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 24(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 32(%esp),%esi + xorl %edx,%ebp + xorl 56(%esp),%esi + andl 
%ebx,%ebp + xorl 12(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,24(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 28(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 36(%esp),%edi + xorl %ecx,%ebp + xorl 60(%esp),%edi + andl %eax,%ebp + xorl 16(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,28(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 32(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 40(%esp),%edx + xorl %ebx,%ebp + xorl (%esp),%edx + andl %esi,%ebp + xorl 20(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,32(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 36(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 44(%esp),%ecx + xorl %eax,%ebp + xorl 4(%esp),%ecx + andl %edi,%ebp + xorl 24(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,36(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 40(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 48(%esp),%ebx + xorl %esi,%ebp + xorl 8(%esp),%ebx + andl %edx,%ebp + xorl 28(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,40(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 44(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 52(%esp),%eax + xorl %edi,%ebp + xorl 12(%esp),%eax + andl %ecx,%ebp + xorl 32(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,44(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 48(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 56(%esp),%esi + 
xorl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 36(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,48(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 52(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 60(%esp),%edi + xorl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 40(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,52(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 56(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl (%esp),%edx + xorl %eax,%ebp + xorl 24(%esp),%edx + xorl %ebx,%ebp + xorl 44(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,56(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 60(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 4(%esp),%ecx + xorl %esi,%ebp + xorl 28(%esp),%ecx + xorl %eax,%ebp + xorl 48(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,60(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl (%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 8(%esp),%ebx + xorl %edi,%ebp + xorl 32(%esp),%ebx + xorl %esi,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 12(%esp),%eax + xorl %edx,%ebp + xorl 36(%esp),%eax + xorl %edi,%ebp + xorl 56(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,4(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 16(%esp),%esi + xorl %ecx,%ebp + xorl 40(%esp),%esi + xorl %edx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,8(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + 
xorl 20(%esp),%edi + xorl %ebx,%ebp + xorl 44(%esp),%edi + xorl %ecx,%ebp + xorl (%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,12(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl 
%ebp,%edi + + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + leal 3395469782(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + leal 3395469782(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + leal 3395469782(%edi,%edx,1),%edi + addl %ebp,%edi + movl 96(%esp),%ebp + movl 100(%esp),%edx + addl (%ebp),%edi + addl 4(%ebp),%esi + addl 8(%ebp),%eax + addl 12(%ebp),%ebx + addl 16(%ebp),%ecx + movl %edi,(%ebp) + addl $64,%edx + movl %esi,4(%ebp) + cmpl 104(%esp),%edx + movl %eax,8(%ebp) + movl %ecx,%edi + movl %ebx,12(%ebp) + movl %edx,%esi + movl %ecx,16(%ebp) + jb .L002loop + 
addl $76,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha1_block_data_order,.-.L_sha1_block_data_order_begin +.type _sha1_block_data_order_shaext,@function +.align 16 +_sha1_block_data_order_shaext: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L003pic_point +.L003pic_point: + popl %ebp + leal .LK_XX_XX-.L003pic_point(%ebp),%ebp +.Lshaext_shortcut: + movl 20(%esp),%edi + movl %esp,%ebx + movl 24(%esp),%esi + movl 28(%esp),%ecx + subl $32,%esp + movdqu (%edi),%xmm0 + movd 16(%edi),%xmm1 + andl $-32,%esp + movdqa 80(%ebp),%xmm3 + movdqu (%esi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%esi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%esi),%xmm6 +.byte 102,15,56,0,227 + movdqu 48(%esi),%xmm7 +.byte 102,15,56,0,235 +.byte 102,15,56,0,243 +.byte 102,15,56,0,251 + jmp .L004loop_shaext +.align 16 +.L004loop_shaext: + decl %ecx + leal 64(%esi),%eax + movdqa %xmm1,(%esp) + paddd %xmm4,%xmm1 + cmovnel %eax,%esi + movdqa %xmm0,16(%esp) +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,0 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,0 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 
15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,1 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,1 +.byte 15,56,200,206 + pxor %xmm7,%xmm5 +.byte 15,56,202,236 +.byte 15,56,201,247 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,215 + pxor %xmm4,%xmm6 +.byte 15,56,201,252 +.byte 15,56,202,245 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,204 + pxor %xmm5,%xmm7 +.byte 15,56,202,254 +.byte 15,56,201,229 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,2 +.byte 15,56,200,213 + pxor %xmm6,%xmm4 +.byte 15,56,201,238 +.byte 15,56,202,231 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,2 +.byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 @@ -4326,68 +5506,1288 @@ _sha1_block_data_order_shaext: .byte 15,56,202,254 movdqu (%esi),%xmm4 movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%esi),%xmm5 -.byte 102,15,56,0,227 +.byte 15,58,204,193,3 +.byte 15,56,200,213 + movdqu 16(%esi),%xmm5 +.byte 102,15,56,0,227 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 +.byte 15,56,200,206 + movdqu 32(%esi),%xmm6 +.byte 102,15,56,0,235 + movdqa %xmm0,%xmm2 +.byte 15,58,204,193,3 +.byte 15,56,200,215 + movdqu 48(%esi),%xmm7 +.byte 102,15,56,0,243 + movdqa %xmm0,%xmm1 +.byte 15,58,204,194,3 + movdqa (%esp),%xmm2 +.byte 102,15,56,0,251 +.byte 15,56,200,202 + paddd 16(%esp),%xmm0 + jnz .L004loop_shaext + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%edi) + movd %xmm1,16(%edi) + movl %ebx,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext +.type _sha1_block_data_order_ssse3,@function +.align 16 +_sha1_block_data_order_ssse3: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L005pic_point +.L005pic_point: + popl %ebp + leal .LK_XX_XX-.L005pic_point(%ebp),%ebp +.Lssse3_shortcut: + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 
32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + movdqa %xmm7,96(%esp) +.byte 102,15,56,0,222 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd %xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp .L006loop +.align 16 +.L006loop: + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx 
+ pslld $2,%xmm7 + xorl %edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 
40(%esp),%edi + psrld $31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi + movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa 
%xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl 
%esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq 
%xmm4,%xmm7 + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,235 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd $238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + 
andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,243 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 - movdqa (%esp),%xmm2 -.byte 102,15,56,0,251 -.byte 15,56,200,202 - paddd 16(%esp),%xmm0 - jnz .L004loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%edi) - movd %xmm1,16(%edi) - movl %ebx,%esp + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl 
$7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa %xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl 
%eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L007done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 
32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp +.byte 102,15,56,0,198 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx +.byte 102,15,56,0,206 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi +.byte 102,15,56,0,214 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax +.byte 102,15,56,0,222 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + 
movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp .L006loop +.align 16 +.L007done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 
192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) popl %edi popl %esi popl %ebx popl %ebp ret -.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext -.type _sha1_block_data_order_ssse3,@function +.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.type _sha1_block_data_order_avx,@function .align 16 -_sha1_block_data_order_ssse3: +_sha1_block_data_order_avx: pushl %ebp pushl %ebx pushl %esi pushl %edi - call .L005pic_point -.L005pic_point: + call .L008pic_point +.L008pic_point: popl %ebp - leal .LK_XX_XX-.L005pic_point(%ebp),%ebp -.Lssse3_shortcut: - movdqa (%ebp),%xmm7 - movdqa 16(%ebp),%xmm0 - movdqa 32(%ebp),%xmm1 - movdqa 48(%ebp),%xmm2 - movdqa 64(%ebp),%xmm6 + leal .LK_XX_XX-.L008pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 movl 20(%esp),%edi movl 24(%esp),%ebp movl 28(%esp),%edx movl %esp,%esi subl $208,%esp andl $-64,%esp - movdqa %xmm0,112(%esp) - movdqa %xmm1,128(%esp) - movdqa %xmm2,144(%esp) + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) shll $6,%edx - movdqa %xmm7,160(%esp) + vmovdqa %xmm7,160(%esp) addl %ebp,%edx - movdqa %xmm6,176(%esp) + vmovdqa %xmm6,176(%esp) addl $64,%ebp movl %edi,192(%esp) movl %ebp,196(%esp) @@ -4399,1050 +6799,1000 @@ _sha1_block_data_order_ssse3: movl 12(%edi),%edx movl 16(%edi),%edi movl %ebx,%esi - movdqu -64(%ebp),%xmm0 - movdqu -48(%ebp),%xmm1 - movdqu -32(%ebp),%xmm2 - movdqu -16(%ebp),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - movdqa %xmm7,96(%esp) -.byte 102,15,56,0,222 - paddd %xmm7,%xmm0 - paddd %xmm7,%xmm1 - paddd %xmm7,%xmm2 - movdqa %xmm0,(%esp) - psubd %xmm7,%xmm0 - movdqa %xmm1,16(%esp) - psubd %xmm7,%xmm1 - 
movdqa %xmm2,32(%esp) + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) movl %ecx,%ebp - psubd %xmm7,%xmm2 + vmovdqa %xmm5,16(%esp) xorl %edx,%ebp - pshufd $238,%xmm0,%xmm4 + vmovdqa %xmm6,32(%esp) andl %ebp,%esi - jmp .L006loop + jmp .L009loop .align 16 -.L006loop: - rorl $2,%ebx +.L009loop: + shrdl $2,%ebx,%ebx xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 movl %eax,%ebp - punpcklqdq %xmm1,%xmm4 - movdqa %xmm3,%xmm6 addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) xorl %ecx,%ebx - paddd %xmm3,%xmm7 - movdqa %xmm0,64(%esp) - roll $5,%eax + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 addl %esi,%edi - psrldq $4,%xmm6 andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 xorl %ecx,%ebx - pxor %xmm0,%xmm4 addl %eax,%edi - rorl $7,%eax - pxor %xmm2,%xmm6 + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) movl %edi,%esi addl 4(%esp),%edx - pxor %xmm6,%xmm4 + vpxor %xmm6,%xmm4,%xmm4 xorl %ebx,%eax - roll $5,%edi - movdqa %xmm7,48(%esp) + shldl $5,%edi,%edi addl %ebp,%edx andl %eax,%esi - movdqa %xmm4,%xmm0 + vpsrld $31,%xmm4,%xmm6 xorl %ebx,%eax addl %edi,%edx - rorl $7,%edi - movdqa %xmm4,%xmm6 + shrdl $7,%edi,%edi xorl %ebx,%esi - pslldq $12,%xmm0 - paddd %xmm4,%xmm4 + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 movl %edx,%ebp addl 8(%esp),%ecx - psrld $31,%xmm6 xorl %eax,%edi - roll $5,%edx - movdqa %xmm0,%xmm7 + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 addl %esi,%ecx andl %edi,%ebp xorl %eax,%edi - psrld $30,%xmm0 addl %edx,%ecx - rorl $7,%edx - por %xmm6,%xmm4 + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 movl %ecx,%esi addl 12(%esp),%ebx - pslld $2,%xmm7 xorl 
%edi,%edx - roll $5,%ecx - pxor %xmm0,%xmm4 - movdqa 96(%esp),%xmm0 + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 addl %ebp,%ebx andl %edx,%esi - pxor %xmm7,%xmm4 - pshufd $238,%xmm1,%xmm5 + vmovdqa 96(%esp),%xmm0 xorl %edi,%edx addl %ecx,%ebx - rorl $7,%ecx + shrdl $7,%ecx,%ecx xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 movl %ebx,%ebp - punpcklqdq %xmm2,%xmm5 - movdqa %xmm4,%xmm7 addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) xorl %edx,%ecx - paddd %xmm4,%xmm0 - movdqa %xmm1,80(%esp) - roll $5,%ebx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 addl %esi,%eax - psrldq $4,%xmm7 andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 xorl %edx,%ecx - pxor %xmm1,%xmm5 addl %ebx,%eax - rorl $7,%ebx - pxor %xmm3,%xmm7 + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx xorl %edx,%ebp + vmovdqa %xmm0,(%esp) movl %eax,%esi addl 20(%esp),%edi - pxor %xmm7,%xmm5 + vpxor %xmm7,%xmm5,%xmm5 xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm0,(%esp) + shldl $5,%eax,%eax addl %ebp,%edi andl %ebx,%esi - movdqa %xmm5,%xmm1 + vpsrld $31,%xmm5,%xmm7 xorl %ecx,%ebx addl %eax,%edi - rorl $7,%eax - movdqa %xmm5,%xmm7 + shrdl $7,%eax,%eax xorl %ecx,%esi - pslldq $12,%xmm1 - paddd %xmm5,%xmm5 + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 movl %edi,%ebp addl 24(%esp),%edx - psrld $31,%xmm7 xorl %ebx,%eax - roll $5,%edi - movdqa %xmm1,%xmm0 + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 addl %esi,%edx andl %eax,%ebp xorl %ebx,%eax - psrld $30,%xmm1 addl %edi,%edx - rorl $7,%edi - por %xmm7,%xmm5 + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 movl %edx,%esi addl 28(%esp),%ecx - pslld $2,%xmm0 xorl %eax,%edi - roll $5,%edx - pxor %xmm1,%xmm5 - movdqa 112(%esp),%xmm1 + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 addl %ebp,%ecx andl %edi,%esi - pxor %xmm0,%xmm5 - pshufd $238,%xmm2,%xmm6 + vmovdqa 112(%esp),%xmm1 xorl %eax,%edi addl %edx,%ecx - rorl $7,%edx + shrdl $7,%edx,%edx xorl %eax,%esi + vpalignr 
$8,%xmm2,%xmm3,%xmm6 movl %ecx,%ebp - punpcklqdq %xmm3,%xmm6 - movdqa %xmm5,%xmm0 addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) xorl %edi,%edx - paddd %xmm5,%xmm1 - movdqa %xmm2,96(%esp) - roll $5,%ecx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 addl %esi,%ebx - psrldq $4,%xmm0 andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 xorl %edi,%edx - pxor %xmm2,%xmm6 addl %ecx,%ebx - rorl $7,%ecx - pxor %xmm4,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) movl %ebx,%esi addl 36(%esp),%eax - pxor %xmm0,%xmm6 + vpxor %xmm0,%xmm6,%xmm6 xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm1,16(%esp) + shldl $5,%ebx,%ebx addl %ebp,%eax andl %ecx,%esi - movdqa %xmm6,%xmm2 + vpsrld $31,%xmm6,%xmm0 xorl %edx,%ecx addl %ebx,%eax - rorl $7,%ebx - movdqa %xmm6,%xmm0 + shrdl $7,%ebx,%ebx xorl %edx,%esi - pslldq $12,%xmm2 - paddd %xmm6,%xmm6 + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 movl %eax,%ebp addl 40(%esp),%edi - psrld $31,%xmm0 xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm2,%xmm1 + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 addl %esi,%edi andl %ebx,%ebp xorl %ecx,%ebx - psrld $30,%xmm2 addl %eax,%edi - rorl $7,%eax - por %xmm0,%xmm6 + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax xorl %ecx,%ebp - movdqa 64(%esp),%xmm0 + vpxor %xmm1,%xmm6,%xmm6 movl %edi,%esi addl 44(%esp),%edx - pslld $2,%xmm1 xorl %ebx,%eax - roll $5,%edi - pxor %xmm2,%xmm6 - movdqa 112(%esp),%xmm2 + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 addl %ebp,%edx andl %eax,%esi - pxor %xmm1,%xmm6 - pshufd $238,%xmm3,%xmm7 + vmovdqa 112(%esp),%xmm2 xorl %ebx,%eax addl %edi,%edx - rorl $7,%edi + shrdl $7,%edi,%edi xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 movl %edx,%ebp - punpcklqdq %xmm4,%xmm7 - movdqa %xmm6,%xmm1 addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) xorl %eax,%edi - paddd %xmm6,%xmm2 - movdqa %xmm3,64(%esp) - roll $5,%edx + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 
addl %esi,%ecx - psrldq $4,%xmm1 andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 xorl %eax,%edi - pxor %xmm3,%xmm7 addl %edx,%ecx - rorl $7,%edx - pxor %xmm5,%xmm1 + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) movl %ecx,%esi addl 52(%esp),%ebx - pxor %xmm1,%xmm7 + vpxor %xmm1,%xmm7,%xmm7 xorl %edi,%edx - roll $5,%ecx - movdqa %xmm2,32(%esp) + shldl $5,%ecx,%ecx addl %ebp,%ebx andl %edx,%esi - movdqa %xmm7,%xmm3 + vpsrld $31,%xmm7,%xmm1 xorl %edi,%edx addl %ecx,%ebx - rorl $7,%ecx - movdqa %xmm7,%xmm1 + shrdl $7,%ecx,%ecx xorl %edi,%esi - pslldq $12,%xmm3 - paddd %xmm7,%xmm7 + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 movl %ebx,%ebp addl 56(%esp),%eax - psrld $31,%xmm1 xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm3,%xmm2 + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 addl %esi,%eax andl %ecx,%ebp xorl %edx,%ecx - psrld $30,%xmm3 addl %ebx,%eax - rorl $7,%ebx - por %xmm1,%xmm7 + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx xorl %edx,%ebp - movdqa 80(%esp),%xmm1 + vpxor %xmm2,%xmm7,%xmm7 movl %eax,%esi addl 60(%esp),%edi - pslld $2,%xmm2 xorl %ecx,%ebx - roll $5,%eax - pxor %xmm3,%xmm7 - movdqa 112(%esp),%xmm3 + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 addl %ebp,%edi andl %ebx,%esi - pxor %xmm2,%xmm7 - pshufd $238,%xmm6,%xmm2 + vmovdqa 112(%esp),%xmm3 xorl %ecx,%ebx addl %eax,%edi - rorl $7,%eax - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax xorl %ecx,%esi movl %edi,%ebp addl (%esp),%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,80(%esp) + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) xorl %ebx,%eax - roll $5,%edi - movdqa %xmm3,%xmm4 + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 addl %esi,%edx - paddd %xmm7,%xmm3 andl %eax,%ebp - pxor %xmm2,%xmm0 + vpxor %xmm2,%xmm0,%xmm0 xorl %ebx,%eax addl %edi,%edx - rorl $7,%edi + shrdl $7,%edi,%edi xorl %ebx,%ebp - movdqa %xmm0,%xmm2 - movdqa 
%xmm3,48(%esp) + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) movl %edx,%esi addl 4(%esp),%ecx xorl %eax,%edi - roll $5,%edx - pslld $2,%xmm0 + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 addl %ebp,%ecx andl %edi,%esi - psrld $30,%xmm2 xorl %eax,%edi addl %edx,%ecx - rorl $7,%edx + shrdl $7,%edx,%edx xorl %eax,%esi movl %ecx,%ebp addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 xorl %edi,%edx - roll $5,%ecx - por %xmm2,%xmm0 + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 addl %esi,%ebx andl %edx,%ebp - movdqa 96(%esp),%xmm2 xorl %edi,%edx addl %ecx,%ebx addl 12(%esp),%eax xorl %edi,%ebp movl %ebx,%esi - pshufd $238,%xmm7,%xmm3 - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %edx,%esi - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 addl 16(%esp),%edi - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 xorl %ecx,%esi movl %eax,%ebp - roll $5,%eax - pxor %xmm2,%xmm1 - movdqa %xmm5,96(%esp) + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) addl %esi,%edi xorl %ecx,%ebp - movdqa %xmm4,%xmm5 - rorl $7,%ebx - paddd %xmm0,%xmm4 + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx addl %eax,%edi - pxor %xmm3,%xmm1 + vpxor %xmm3,%xmm1,%xmm1 addl 20(%esp),%edx xorl %ebx,%ebp movl %edi,%esi - roll $5,%edi - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) addl %ebp,%edx xorl %ebx,%esi - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx - pslld $2,%xmm1 + vpslld $2,%xmm1,%xmm1 addl 24(%esp),%ecx xorl %eax,%esi - psrld $30,%xmm3 movl %edx,%ebp - roll $5,%edx + shldl $5,%edx,%edx addl %esi,%ecx xorl %eax,%ebp - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx - por %xmm3,%xmm1 + vpor %xmm3,%xmm1,%xmm1 addl 28(%esp),%ebx xorl %edi,%ebp - movdqa 64(%esp),%xmm3 + vmovdqa 64(%esp),%xmm3 movl %ecx,%esi - roll $5,%ecx + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl %edi,%esi - rorl $7,%edx - pshufd $238,%xmm0,%xmm4 + shrdl $7,%edx,%edx addl 
%ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 addl 32(%esp),%eax - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 xorl %edx,%esi movl %ebx,%ebp - roll $5,%ebx - pxor %xmm3,%xmm2 - movdqa %xmm6,64(%esp) + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) addl %esi,%eax xorl %edx,%ebp - movdqa 128(%esp),%xmm6 - rorl $7,%ecx - paddd %xmm1,%xmm5 + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx addl %ebx,%eax - pxor %xmm4,%xmm2 + vpxor %xmm4,%xmm2,%xmm2 addl 36(%esp),%edi xorl %ecx,%ebp movl %eax,%esi - roll $5,%eax - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) addl %ebp,%edi xorl %ecx,%esi - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi - pslld $2,%xmm2 + vpslld $2,%xmm2,%xmm2 addl 40(%esp),%edx xorl %ebx,%esi - psrld $30,%xmm4 movl %edi,%ebp - roll $5,%edi + shldl $5,%edi,%edi addl %esi,%edx xorl %ebx,%ebp - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx - por %xmm4,%xmm2 + vpor %xmm4,%xmm2,%xmm2 addl 44(%esp),%ecx xorl %eax,%ebp - movdqa 80(%esp),%xmm4 + vmovdqa 80(%esp),%xmm4 movl %edx,%esi - roll $5,%edx + shldl $5,%edx,%edx addl %ebp,%ecx xorl %eax,%esi - rorl $7,%edi - pshufd $238,%xmm1,%xmm5 + shrdl $7,%edi,%edi addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 addl 48(%esp),%ebx - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 xorl %edi,%esi movl %ecx,%ebp - roll $5,%ecx - pxor %xmm4,%xmm3 - movdqa %xmm7,80(%esp) + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) addl %esi,%ebx xorl %edi,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%edx - paddd %xmm2,%xmm6 + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx addl %ecx,%ebx - pxor %xmm5,%xmm3 + vpxor %xmm5,%xmm3,%xmm3 addl 52(%esp),%eax xorl %edx,%ebp movl %ebx,%esi - roll $5,%ebx - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) addl %ebp,%eax xorl %edx,%esi - rorl 
$7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax - pslld $2,%xmm3 + vpslld $2,%xmm3,%xmm3 addl 56(%esp),%edi xorl %ecx,%esi - psrld $30,%xmm5 movl %eax,%ebp - roll $5,%eax + shldl $5,%eax,%eax addl %esi,%edi xorl %ecx,%ebp - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi - por %xmm5,%xmm3 + vpor %xmm5,%xmm3,%xmm3 addl 60(%esp),%edx xorl %ebx,%ebp - movdqa 96(%esp),%xmm5 + vmovdqa 96(%esp),%xmm5 movl %edi,%esi - roll $5,%edi + shldl $5,%edi,%edi addl %ebp,%edx xorl %ebx,%esi - rorl $7,%eax - pshufd $238,%xmm2,%xmm6 + shrdl $7,%eax,%eax addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 addl (%esp),%ecx - pxor %xmm0,%xmm4 - punpcklqdq %xmm3,%xmm6 xorl %eax,%esi movl %edx,%ebp - roll $5,%edx - pxor %xmm5,%xmm4 - movdqa %xmm0,96(%esp) + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) addl %esi,%ecx xorl %eax,%ebp - movdqa %xmm7,%xmm0 - rorl $7,%edi - paddd %xmm3,%xmm7 + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi addl %edx,%ecx - pxor %xmm6,%xmm4 + vpxor %xmm6,%xmm4,%xmm4 addl 4(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi - roll $5,%ecx - movdqa %xmm4,%xmm6 - movdqa %xmm7,48(%esp) + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) addl %ebp,%ebx xorl %edi,%esi - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx - pslld $2,%xmm4 + vpslld $2,%xmm4,%xmm4 addl 8(%esp),%eax xorl %edx,%esi - psrld $30,%xmm6 movl %ebx,%ebp - roll $5,%ebx + shldl $5,%ebx,%ebx addl %esi,%eax xorl %edx,%ebp - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax - por %xmm6,%xmm4 + vpor %xmm6,%xmm4,%xmm4 addl 12(%esp),%edi xorl %ecx,%ebp - movdqa 64(%esp),%xmm6 + vmovdqa 64(%esp),%xmm6 movl %eax,%esi - roll $5,%eax + shldl $5,%eax,%eax addl %ebp,%edi xorl %ecx,%esi - rorl $7,%ebx - pshufd $238,%xmm3,%xmm7 + shrdl $7,%ebx,%ebx addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 addl 16(%esp),%edx - pxor %xmm1,%xmm5 - punpcklqdq %xmm4,%xmm7 xorl %ebx,%esi movl %edi,%ebp - roll $5,%edi - pxor 
%xmm6,%xmm5 - movdqa %xmm1,64(%esp) + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) addl %esi,%edx xorl %ebx,%ebp - movdqa %xmm0,%xmm1 - rorl $7,%eax - paddd %xmm4,%xmm0 + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax addl %edi,%edx - pxor %xmm7,%xmm5 + vpxor %xmm7,%xmm5,%xmm5 addl 20(%esp),%ecx xorl %eax,%ebp movl %edx,%esi - roll $5,%edx - movdqa %xmm5,%xmm7 - movdqa %xmm0,(%esp) + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) addl %ebp,%ecx xorl %eax,%esi - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx - pslld $2,%xmm5 + vpslld $2,%xmm5,%xmm5 addl 24(%esp),%ebx xorl %edi,%esi - psrld $30,%xmm7 movl %ecx,%ebp - roll $5,%ecx + shldl $5,%ecx,%ecx addl %esi,%ebx xorl %edi,%ebp - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx - por %xmm7,%xmm5 + vpor %xmm7,%xmm5,%xmm5 addl 28(%esp),%eax - movdqa 80(%esp),%xmm7 - rorl $7,%ecx + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx movl %ebx,%esi xorl %edx,%ebp - roll $5,%ebx - pshufd $238,%xmm4,%xmm0 + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 addl 32(%esp),%edi - pxor %xmm2,%xmm6 - punpcklqdq %xmm5,%xmm0 andl %ecx,%esi xorl %edx,%ecx - rorl $7,%ebx - pxor %xmm7,%xmm6 - movdqa %xmm2,80(%esp) + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) movl %eax,%ebp xorl %ecx,%esi - roll $5,%eax - movdqa %xmm1,%xmm2 + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax addl %esi,%edi - paddd %xmm5,%xmm1 + vpxor %xmm0,%xmm6,%xmm6 xorl %ebx,%ebp - pxor %xmm0,%xmm6 xorl %ecx,%ebx addl %eax,%edi addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) andl %ebx,%ebp - movdqa %xmm6,%xmm0 - movdqa %xmm1,16(%esp) xorl %ecx,%ebx - rorl $7,%eax + shrdl $7,%eax,%eax movl %edi,%esi + vpslld $2,%xmm6,%xmm6 xorl %ebx,%ebp - roll $5,%edi - pslld $2,%xmm6 + shldl $5,%edi,%edi addl %ebp,%edx xorl %eax,%esi - psrld $30,%xmm0 xorl %ebx,%eax 
addl %edi,%edx addl 40(%esp),%ecx andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 xorl %ebx,%eax - rorl $7,%edi - por %xmm0,%xmm6 + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 movl %edx,%ebp xorl %eax,%esi - movdqa 96(%esp),%xmm0 - roll $5,%edx + shldl $5,%edx,%edx addl %esi,%ecx xorl %edi,%ebp xorl %eax,%edi addl %edx,%ecx - pshufd $238,%xmm5,%xmm1 addl 44(%esp),%ebx andl %edi,%ebp xorl %eax,%edi - rorl $7,%edx + shrdl $7,%edx,%edx movl %ecx,%esi xorl %edi,%ebp - roll $5,%ecx + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl %edx,%esi xorl %edi,%edx addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 addl 48(%esp),%eax - pxor %xmm3,%xmm7 - punpcklqdq %xmm6,%xmm1 andl %edx,%esi xorl %edi,%edx - rorl $7,%ecx - pxor %xmm0,%xmm7 - movdqa %xmm3,96(%esp) + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) movl %ebx,%ebp xorl %edx,%esi - roll $5,%ebx - movdqa 144(%esp),%xmm3 + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx addl %esi,%eax - paddd %xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 xorl %ecx,%ebp - pxor %xmm1,%xmm7 xorl %edx,%ecx addl %ebx,%eax addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) andl %ecx,%ebp - movdqa %xmm7,%xmm1 - movdqa %xmm2,32(%esp) xorl %edx,%ecx - rorl $7,%ebx + shrdl $7,%ebx,%ebx movl %eax,%esi + vpslld $2,%xmm7,%xmm7 xorl %ecx,%ebp - roll $5,%eax - pslld $2,%xmm7 + shldl $5,%eax,%eax addl %ebp,%edi xorl %ebx,%esi - psrld $30,%xmm1 xorl %ecx,%ebx addl %eax,%edi addl 56(%esp),%edx andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 xorl %ecx,%ebx - rorl $7,%eax - por %xmm1,%xmm7 + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 movl %edi,%ebp xorl %ebx,%esi - movdqa 64(%esp),%xmm1 - roll $5,%edi + shldl $5,%edi,%edi addl %esi,%edx xorl %eax,%ebp xorl %ebx,%eax addl %edi,%edx - pshufd $238,%xmm6,%xmm2 addl 60(%esp),%ecx andl %eax,%ebp xorl %ebx,%eax - rorl $7,%edi + shrdl $7,%edi,%edi movl %edx,%esi xorl %eax,%ebp - roll $5,%edx + shldl $5,%edx,%edx addl %ebp,%ecx xorl %edi,%esi xorl %eax,%edi addl 
%edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 addl (%esp),%ebx - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 andl %edi,%esi xorl %eax,%edi - rorl $7,%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,64(%esp) + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) movl %ecx,%ebp xorl %edi,%esi - roll $5,%ecx - movdqa %xmm3,%xmm4 + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx addl %esi,%ebx - paddd %xmm7,%xmm3 + vpxor %xmm2,%xmm0,%xmm0 xorl %edx,%ebp - pxor %xmm2,%xmm0 xorl %edi,%edx addl %ecx,%ebx addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) andl %edx,%ebp - movdqa %xmm0,%xmm2 - movdqa %xmm3,48(%esp) xorl %edi,%edx - rorl $7,%ecx + shrdl $7,%ecx,%ecx movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 xorl %edx,%ebp - roll $5,%ebx - pslld $2,%xmm0 + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %ecx,%esi - psrld $30,%xmm2 xorl %edx,%ecx addl %ebx,%eax addl 8(%esp),%edi andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 xorl %edx,%ecx - rorl $7,%ebx - por %xmm2,%xmm0 + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 movl %eax,%ebp xorl %ecx,%esi - movdqa 80(%esp),%xmm2 - roll $5,%eax + shldl $5,%eax,%eax addl %esi,%edi xorl %ebx,%ebp xorl %ecx,%ebx addl %eax,%edi - pshufd $238,%xmm7,%xmm3 addl 12(%esp),%edx andl %ebx,%ebp xorl %ecx,%ebx - rorl $7,%eax + shrdl $7,%eax,%eax movl %edi,%esi xorl %ebx,%ebp - roll $5,%edi + shldl $5,%edi,%edi addl %ebp,%edx xorl %eax,%esi xorl %ebx,%eax addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 addl 16(%esp),%ecx - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 andl %eax,%esi xorl %ebx,%eax - rorl $7,%edi - pxor %xmm2,%xmm1 - movdqa %xmm5,80(%esp) + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) movl %edx,%ebp xorl %eax,%esi - roll $5,%edx - movdqa %xmm4,%xmm5 + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx addl %esi,%ecx - paddd %xmm0,%xmm4 + vpxor %xmm3,%xmm1,%xmm1 xorl %edi,%ebp - pxor %xmm3,%xmm1 xorl %eax,%edi addl %edx,%ecx addl 
20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) andl %edi,%ebp - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) xorl %eax,%edi - rorl $7,%edx + shrdl $7,%edx,%edx movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 xorl %edi,%ebp - roll $5,%ecx - pslld $2,%xmm1 + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl %edx,%esi - psrld $30,%xmm3 xorl %edi,%edx addl %ecx,%ebx addl 24(%esp),%eax andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 xorl %edi,%edx - rorl $7,%ecx - por %xmm3,%xmm1 + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 movl %ebx,%ebp xorl %edx,%esi - movdqa 96(%esp),%xmm3 - roll $5,%ebx + shldl $5,%ebx,%ebx addl %esi,%eax xorl %ecx,%ebp xorl %edx,%ecx addl %ebx,%eax - pshufd $238,%xmm0,%xmm4 addl 28(%esp),%edi andl %ecx,%ebp xorl %edx,%ecx - rorl $7,%ebx + shrdl $7,%ebx,%ebx movl %eax,%esi xorl %ecx,%ebp - roll $5,%eax + shldl $5,%eax,%eax addl %ebp,%edi xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 addl 32(%esp),%edx - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 andl %ebx,%esi xorl %ecx,%ebx - rorl $7,%eax - pxor %xmm3,%xmm2 - movdqa %xmm6,96(%esp) + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) movl %edi,%ebp xorl %ebx,%esi - roll $5,%edi - movdqa %xmm5,%xmm6 + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi addl %esi,%edx - paddd %xmm1,%xmm5 + vpxor %xmm4,%xmm2,%xmm2 xorl %eax,%ebp - pxor %xmm4,%xmm2 xorl %ebx,%eax addl %edi,%edx addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) andl %eax,%ebp - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) xorl %ebx,%eax - rorl $7,%edi + shrdl $7,%edi,%edi movl %edx,%esi + vpslld $2,%xmm2,%xmm2 xorl %eax,%ebp - roll $5,%edx - pslld $2,%xmm2 + shldl $5,%edx,%edx addl %ebp,%ecx xorl %edi,%esi - psrld $30,%xmm4 xorl %eax,%edi addl %edx,%ecx addl 40(%esp),%ebx andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 xorl %eax,%edi - rorl $7,%edx - por %xmm4,%xmm2 + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 movl %ecx,%ebp xorl %edi,%esi - movdqa 
64(%esp),%xmm4 - roll $5,%ecx + shldl $5,%ecx,%ecx addl %esi,%ebx xorl %edx,%ebp xorl %edi,%edx addl %ecx,%ebx - pshufd $238,%xmm1,%xmm5 addl 44(%esp),%eax andl %edx,%ebp xorl %edi,%edx - rorl $7,%ecx + shrdl $7,%ecx,%ecx movl %ebx,%esi xorl %edx,%ebp - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %edx,%esi addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 addl 48(%esp),%edi - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 xorl %ecx,%esi movl %eax,%ebp - roll $5,%eax - pxor %xmm4,%xmm3 - movdqa %xmm7,64(%esp) + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) addl %esi,%edi xorl %ecx,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%ebx - paddd %xmm2,%xmm6 + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx addl %eax,%edi - pxor %xmm5,%xmm3 + vpxor %xmm5,%xmm3,%xmm3 addl 52(%esp),%edx xorl %ebx,%ebp movl %edi,%esi - roll $5,%edi - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) addl %ebp,%edx xorl %ebx,%esi - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx - pslld $2,%xmm3 + vpslld $2,%xmm3,%xmm3 addl 56(%esp),%ecx xorl %eax,%esi - psrld $30,%xmm5 movl %edx,%ebp - roll $5,%edx + shldl $5,%edx,%edx addl %esi,%ecx xorl %eax,%ebp - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx - por %xmm5,%xmm3 + vpor %xmm5,%xmm3,%xmm3 addl 60(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi - roll $5,%ecx + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl %edi,%esi - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 xorl %edx,%esi movl %ebx,%ebp - roll $5,%ebx + shldl $5,%ebx,%ebx addl %esi,%eax + vmovdqa %xmm7,48(%esp) xorl %edx,%ebp - rorl $7,%ecx - paddd %xmm3,%xmm7 + shrdl $7,%ecx,%ecx addl %ebx,%eax addl 4(%esp),%edi xorl %ecx,%ebp movl %eax,%esi - movdqa %xmm7,48(%esp) - roll $5,%eax + shldl $5,%eax,%eax addl %ebp,%edi xorl %ecx,%esi - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi addl 8(%esp),%edx xorl %ebx,%esi movl %edi,%ebp - 
roll $5,%edi + shldl $5,%edi,%edi addl %esi,%edx xorl %ebx,%ebp - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx addl 12(%esp),%ecx xorl %eax,%ebp movl %edx,%esi - roll $5,%edx + shldl $5,%edx,%edx addl %ebp,%ecx xorl %eax,%esi - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp - je .L007done - movdqa 160(%esp),%xmm7 - movdqa 176(%esp),%xmm6 - movdqu (%ebp),%xmm0 - movdqu 16(%ebp),%xmm1 - movdqu 32(%ebp),%xmm2 - movdqu 48(%ebp),%xmm3 + je .L010done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 addl $64,%ebp -.byte 102,15,56,0,198 + vpshufb %xmm6,%xmm0,%xmm0 movl %ebp,196(%esp) - movdqa %xmm7,96(%esp) + vmovdqa %xmm7,96(%esp) addl 16(%esp),%ebx xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 movl %ecx,%ebp - roll $5,%ecx + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 addl %esi,%ebx xorl %edi,%ebp - rorl $7,%edx -.byte 102,15,56,0,206 + shrdl $7,%edx,%edx addl %ecx,%ebx + vmovdqa %xmm4,(%esp) addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi - paddd %xmm7,%xmm0 - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %edx,%esi - rorl $7,%ecx - movdqa %xmm0,(%esp) + shrdl $7,%ecx,%ecx addl %ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp - psubd %xmm7,%xmm0 - roll $5,%eax + shldl $5,%eax,%eax addl %esi,%edi xorl %ecx,%ebp - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi - roll $5,%edi + shldl $5,%edi,%edi addl %ebp,%edx xorl %ebx,%esi - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 movl %edx,%ebp - roll $5,%edx + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 addl %esi,%ecx xorl %eax,%ebp - rorl $7,%edi -.byte 102,15,56,0,214 + shrdl $7,%edi,%edi addl %edx,%ecx + vmovdqa %xmm5,16(%esp) addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi - paddd %xmm7,%xmm1 - roll $5,%ecx + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl 
%edi,%esi - rorl $7,%edx - movdqa %xmm1,16(%esp) + shrdl $7,%edx,%edx addl %ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp - psubd %xmm7,%xmm1 - roll $5,%ebx + shldl $5,%ebx,%ebx addl %esi,%eax xorl %edx,%ebp - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi - roll $5,%eax + shldl $5,%eax,%eax addl %ebp,%edi xorl %ecx,%esi - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 movl %edi,%ebp - roll $5,%edi + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 addl %esi,%edx xorl %ebx,%ebp - rorl $7,%eax -.byte 102,15,56,0,222 + shrdl $7,%eax,%eax addl %edi,%edx + vmovdqa %xmm6,32(%esp) addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi - paddd %xmm7,%xmm2 - roll $5,%edx + shldl $5,%edx,%edx addl %ebp,%ecx xorl %eax,%esi - rorl $7,%edi - movdqa %xmm2,32(%esp) + shrdl $7,%edi,%edi addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp - psubd %xmm7,%xmm2 - roll $5,%ecx + shldl $5,%ecx,%ecx addl %esi,%ebx xorl %edi,%ebp - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax movl 192(%esp),%ebp addl (%ebp),%eax @@ -5452,113 +7802,113 @@ _sha1_block_data_order_ssse3: addl 12(%ebp),%edx movl %esi,4(%ebp) addl 16(%ebp),%edi - movl %ecx,8(%ebp) movl %ecx,%ebx - movl %edx,12(%ebp) + movl %ecx,8(%ebp) xorl %edx,%ebx + movl %edx,12(%ebp) movl %edi,16(%ebp) movl %esi,%ebp - pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx - jmp .L006loop + jmp .L009loop .align 16 -.L007done: +.L010done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp - roll $5,%ecx + shldl $5,%ecx,%ecx addl %esi,%ebx xorl %edi,%ebp - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax xorl %edx,%esi - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl 
%ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp - roll $5,%eax + shldl $5,%eax,%eax addl %esi,%edi xorl %ecx,%ebp - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi - roll $5,%edi + shldl $5,%edi,%edi addl %ebp,%edx xorl %ebx,%esi - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi movl %edx,%ebp - roll $5,%edx + shldl $5,%edx,%edx addl %esi,%ecx xorl %eax,%ebp - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi - roll $5,%ecx + shldl $5,%ecx,%ecx addl %ebp,%ebx xorl %edi,%esi - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp - roll $5,%ebx + shldl $5,%ebx,%ebx addl %esi,%eax xorl %edx,%ebp - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi - roll $5,%eax + shldl $5,%eax,%eax addl %ebp,%edi xorl %ecx,%esi - rorl $7,%ebx + shrdl $7,%ebx,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi movl %edi,%ebp - roll $5,%edi + shldl $5,%edi,%edi addl %esi,%edx xorl %ebx,%ebp - rorl $7,%eax + shrdl $7,%eax,%eax addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi - roll $5,%edx + shldl $5,%edx,%edx addl %ebp,%ecx xorl %eax,%esi - rorl $7,%edi + shrdl $7,%edi,%edi addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp - roll $5,%ecx + shldl $5,%ecx,%ecx addl %esi,%ebx xorl %edi,%ebp - rorl $7,%edx + shrdl $7,%edx,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi - roll $5,%ebx + shldl $5,%ebx,%ebx addl %ebp,%eax - rorl $7,%ecx + shrdl $7,%ecx,%ecx addl %ebx,%eax + vzeroall movl 192(%esp),%ebp addl (%ebp),%eax movl 204(%esp),%esp @@ -5576,7 +7926,7 @@ _sha1_block_data_order_ssse3: popl %ebx popl %ebp ret -.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 
diff --git a/secure/lib/libcrypto/i386/sha256-586.S b/secure/lib/libcrypto/i386/sha256-586.S index 5d8476c1e1b..7b4205352bd 100644 --- a/secure/lib/libcrypto/i386/sha256-586.S +++ b/secure/lib/libcrypto/i386/sha256-586.S @@ -42,12 +42,13 @@ sha256_block_data_order: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L005AVX testl $512,%ebx - jnz .L005SSSE3 + jnz .L006SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L006unrolled + jae .L007unrolled jmp .L002loop .align 16 .L002loop: @@ -119,7 +120,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15: +.L00800_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -157,11 +158,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15 + jne .L00800_15 movl 156(%esp),%ecx - jmp .L00816_63 + jmp .L00916_63 .align 16 -.L00816_63: +.L00916_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -216,7 +217,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63 + jne .L00916_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -260,7 +261,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L006unrolled: +.L007unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -277,9 +278,9 @@ sha256_block_data_order: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L009grand_loop + jmp .L010grand_loop .align 16 -.L009grand_loop: +.L010grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -3159,7 +3160,7 @@ sha256_block_data_order: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L010grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -3178,9 +3179,9 @@ sha256_block_data_order: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext + jmp .L011loop_shaext .align 16 -.L010loop_shaext: +.L011loop_shaext: movdqu 
(%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -3350,7 +3351,7 @@ sha256_block_data_order: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L010loop_shaext + jnz .L011loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -3365,7 +3366,7 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L005SSSE3: +.L006SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -3384,9 +3385,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L012grand_ssse3 .align 16 -.L011grand_ssse3: +.L012grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3409,9 +3410,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L013ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L013ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4054,7 +4055,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L013ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4568,4550 +4569,8960 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L012grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.size sha256_block_data_order,.-.L_sha256_block_data_order_begin -.comm OPENSSL_ia32cap_P,16,4 -#else -.text -.globl sha256_block_data_order -.type sha256_block_data_order,@function -.align 16 -sha256_block_data_order: -.L_sha256_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl %esp,%ebx - call .L000pic_point -.L000pic_point: - popl %ebp - leal .L001K256-.L000pic_point(%ebp),%ebp - subl $16,%esp - andl $-64,%esp - shll $6,%eax - addl %edi,%eax - movl %esi,(%esp) - movl %edi,4(%esp) - movl %eax,8(%esp) - movl %ebx,12(%esp) - leal 
OPENSSL_ia32cap_P,%edx - movl (%edx),%ecx - movl 4(%edx),%ebx - testl $1048576,%ecx - jnz .L002loop - movl 8(%edx),%edx - testl $16777216,%ecx - jz .L003no_xmm - andl $1073741824,%ecx - andl $268435968,%ebx - testl $536870912,%edx - jnz .L004shaext - orl %ebx,%ecx - andl $1342177280,%ecx - cmpl $1342177280,%ecx - testl $512,%ebx - jnz .L005SSSE3 -.L003no_xmm: - subl %edi,%eax - cmpl $256,%eax - jae .L006unrolled - jmp .L002loop -.align 16 -.L002loop: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - bswap %eax - movl 12(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - bswap %eax - movl 28(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %eax - movl 44(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - bswap %eax - movl 60(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - addl $64,%edi - leal -36(%esp),%esp - movl %edi,104(%esp) +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi - movl %ebx,8(%esp) + movl %ebx,4(%esp) xorl %ecx,%ebx - movl %ecx,12(%esp) - movl %edi,16(%esp) - movl %ebx,(%esp) + movl %ecx,8(%esp) + movl %edi,12(%esp) movl 16(%esi),%edx - movl 20(%esi),%ebx + movl 20(%esi),%edi movl 24(%esi),%ecx - movl 28(%esi),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - movl %edi,32(%esp) + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + 
vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 .align 16 -.L00700_15: +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 movl %edx,%ecx - movl 24(%esp),%esi - rorl $14,%ecx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl 96(%esp),%ebx - rorl $5,%ecx - andl %edx,%esi - movl %edx,20(%esp) + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 xorl %ecx,%edx - addl 32(%esp),%ebx + movl 24(%esp),%edi xorl %edi,%esi - rorl $6,%edx + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%ecx - movl 20(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx addl %ecx,%ebx - xorl %edi,%eax - addl $4,%ebp - addl %ebx,%eax - cmpl $3248222580,%esi - jne .L00700_15 - movl 156(%esp),%ecx - jmp .L00816_63 -.align 16 -.L00816_63: - movl %ecx,%ebx - movl 104(%esp),%esi - rorl $11,%ecx - movl 
%esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 160(%esp),%ebx - shrl $10,%edi - addl 124(%esp),%ebx + vpslld $11,%xmm5,%xmm5 movl %edx,%ecx - xorl %esi,%edi - movl 24(%esp),%esi - rorl $14,%ecx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi xorl %edi,%esi - movl %ebx,96(%esp) - rorl $5,%ecx - andl %edx,%esi - movl %edx,20(%esp) + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 xorl %ecx,%edx - addl 32(%esp),%ebx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi xorl %edi,%esi - rorl $6,%edx + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 xorl %eax,%ecx - movl 20(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) 
- addl %ebx,%edx - andl 4(%esp),%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx addl %ecx,%ebx - xorl %edi,%eax - movl 156(%esp),%ecx - addl $4,%ebp - addl %ebx,%eax - cmpl $3329325298,%esi - jne .L00816_63 - movl 356(%esp),%esi - movl 8(%esp),%ebx - movl 16(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl 24(%esp),%eax - movl 28(%esp),%ebx - movl 32(%esp),%ecx - movl 360(%esp),%edi - addl 16(%esi),%edx - addl 20(%esi),%eax - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %eax,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - leal 356(%esp),%esp - subl $256,%ebp - cmpl 8(%esp),%edi - jb .L002loop - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 64 -.L001K256: -.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 -.long 66051,67438087,134810123,202182159 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 
112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.align 16 -.L006unrolled: - leal -96(%esp),%esp - movl (%esi),%eax - movl 4(%esi),%ebp - movl 8(%esi),%ecx - movl 12(%esi),%ebx - movl %ebp,4(%esp) - xorl %ecx,%ebp - movl %ecx,8(%esp) - movl %ebx,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %esi,28(%esp) - jmp .L009grand_loop -.align 16 -.L009grand_loop: - movl (%edi),%ebx - movl 4(%edi),%ecx - bswap %ebx - movl 8(%edi),%esi - bswap %ecx - movl %ebx,32(%esp) - bswap %esi - movl %ecx,36(%esp) - movl %esi,40(%esp) - movl 12(%edi),%ebx - movl 16(%edi),%ecx - bswap %ebx - movl 20(%edi),%esi - bswap %ecx - movl %ebx,44(%esp) - bswap %esi - movl %ecx,48(%esp) - movl %esi,52(%esp) - movl 24(%edi),%ebx - movl 28(%edi),%ecx - bswap %ebx - movl 32(%edi),%esi - bswap %ecx - movl %ebx,56(%esp) - bswap %esi - movl %ecx,60(%esp) - movl %esi,64(%esp) - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %ebx - movl 44(%edi),%esi - bswap %ecx - movl %ebx,68(%esp) - bswap %esi - movl %ecx,72(%esp) - movl %esi,76(%esp) - movl 48(%edi),%ebx - movl 52(%edi),%ecx - bswap %ebx - movl 56(%edi),%esi - bswap %ecx - movl %ebx,80(%esp) - bswap %esi - movl %ecx,84(%esp) - movl %esi,88(%esp) - movl 60(%edi),%ebx - addl $64,%edi - bswap %ebx - movl %edi,100(%esp) - movl %ebx,92(%esp) + vpaddd %xmm7,%xmm0,%xmm0 movl %edx,%ecx - movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 xorl %ecx,%edx - movl 32(%esp),%ebx + movl 12(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi - movl %ecx,16(%esp) + movl %ecx,4(%esp) xorl %ecx,%edx - addl 28(%esp),%ebx xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 
1116352408(%ebx,%edx,1),%edx + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx - movl 20(%esp),%edi - xorl %esi,%edx - movl 36(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1899447441(%ebx,%edx,1),%edx - xorl %ecx,%esi + addl 44(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 movl %edx,%ecx - movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 xorl %ecx,%edx - movl 40(%esp),%ebx + movl 8(%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx andl %ecx,%esi - movl %ecx,8(%esp) + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 xorl %ecx,%edx - addl 20(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 movl %eax,%esi - movl 28(%esp),%edi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 xorl %eax,%ecx - movl %eax,24(%esp) xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3049323471(%ebx,%edx,1),%edx + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - 
addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx - movl 12(%esp),%edi - xorl %esi,%edx - movl 44(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3921009573(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 movl %edx,%ecx - movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 xorl %ecx,%edx - movl 48(%esp),%ebx + movl 4(%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx andl %ecx,%esi - movl %ecx,(%esp) + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 xorl %ecx,%edx - addl 12(%esp),%ebx xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 961987163(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx - movl 4(%esp),%edi - xorl %esi,%edx - movl 52(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl 
%edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1508970993(%ebx,%edx,1),%edx - xorl %ecx,%esi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx - addl %esi,%eax + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 movl %edx,%ecx + shrdl $14,%edx,%edx movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi + vpaddd %xmm7,%xmm1,%xmm1 xorl %ecx,%edx - movl 56(%esp),%ebx + movl (%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 xorl %ecx,%edx - addl 4(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 12(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2453635748(%ebx,%edx,1),%edx + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 24(%esp),%ecx - rorl $14,%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx movl 28(%esp),%edi - xorl %esi,%edx - movl 60(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl 
%ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2870763221(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 16(%esp),%edx - addl %esi,%eax + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 movl %edx,%ecx + shrdl $14,%edx,%edx movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi + vpalignr $4,%xmm0,%xmm1,%xmm7 xorl %ecx,%edx - movl 64(%esp),%ebx + movl 24(%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 xorl %ecx,%edx - addl 28(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 4(%esp),%edi - xorl %eax,%ecx + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3624381080(%ebx,%edx,1),%edx + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx addl 12(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi 
+ vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx movl 20(%esp),%edi - xorl %esi,%edx - movl 68(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 310598401(%ebx,%edx,1),%edx - xorl %ecx,%esi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 addl %edx,%eax addl 8(%esp),%edx - addl %esi,%eax + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 movl %edx,%ecx + shrdl $14,%edx,%edx movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi + vpaddd %xmm7,%xmm2,%xmm2 xorl %ecx,%edx - movl 72(%esp),%ebx + movl 16(%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 xorl %ecx,%edx - addl 20(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 28(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 607225278(%ebx,%edx,1),%edx + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl 
%eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx movl 12(%esp),%edi - xorl %esi,%edx - movl 76(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1426881987(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl (%esp),%edx - addl %esi,%eax + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 movl %edx,%ecx + shrdl $14,%edx,%edx movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi + vpalignr $4,%xmm1,%xmm2,%xmm7 xorl %ecx,%edx - movl 80(%esp),%ebx + movl 8(%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 xorl %ecx,%edx - addl 12(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 20(%esp),%edi - xorl %eax,%ecx + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx movl 
%eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1925078388(%ebx,%edx,1),%edx + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx movl 4(%esp),%edi - xorl %esi,%edx - movl 84(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2162078206(%ebx,%edx,1),%edx - xorl %ecx,%esi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx xorl %edi,%eax - rorl $2,%esi + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx - addl %esi,%eax + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 movl %edx,%ecx + shrdl $14,%edx,%edx movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi + vpaddd %xmm7,%xmm3,%xmm3 xorl %ecx,%edx - movl 88(%esp),%ebx + movl (%esp),%edi xorl %edi,%esi - rorl $5,%edx + vpshufd 
$80,%xmm3,%xmm7 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 xorl %ecx,%edx - addl 4(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 12(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2614888103(%ebx,%edx,1),%edx + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 24(%esp),%ecx - rorl $14,%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx movl 28(%esp),%edi - xorl %esi,%edx - movl 92(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3248222580(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx 
- movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi xorl %ecx,%edx - movl %ebx,32(%esp) + movl 24(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx - addl 28(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 4(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3835390401(%ebx,%edx,1),%edx + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl 
%edi,%edx movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 4022224774(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi xorl %ecx,%edx - movl %ebx,40(%esp) + movl 16(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx - addl 20(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 28(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,24(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 264347078(%ebx,%edx,1),%edx + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl 
$14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 604807628(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi xorl %ecx,%edx - movl %ebx,48(%esp) + movl 8(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx - addl 12(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 20(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,16(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 770255983(%ebx,%edx,1),%edx + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 48(%esp),%edx + 
xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1249150122(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi xorl %ecx,%edx - movl %ebx,56(%esp) + movl (%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx - addl 4(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + 
shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 12(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,8(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1555081692(%ebx,%edx,1),%edx + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1996064986(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx 
- xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi xorl %ecx,%edx - movl %ebx,64(%esp) + movl 24(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx - addl 28(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 4(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2554220882(%ebx,%edx,1),%edx + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 
2821834349(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi xorl %ecx,%edx - movl %ebx,72(%esp) + movl 16(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx - addl 20(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 28(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,24(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2952996808(%ebx,%edx,1),%edx + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - 
rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3210313671(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi xorl %ecx,%edx - movl %ebx,80(%esp) + movl 8(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx - addl 12(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 20(%esp),%edi - xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,16(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3336571891(%ebx,%edx,1),%edx + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl 
%esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3584528711(%ebx,%edx,1),%edx - xorl %ecx,%esi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi + shrdl $2,%ecx,%ecx addl %edx,%eax addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx + addl %ecx,%eax movl %edx,%ecx - xorl %esi,%edi + shrdl $14,%edx,%edx movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi xorl %ecx,%edx - movl %ebx,88(%esp) + movl (%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx - addl 4(%esp),%ebx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi + addl %edi,%edx movl 12(%esp),%edi 
- xorl %eax,%ecx + movl %eax,%esi + shrdl $9,%ecx,%ecx movl %eax,8(%esp) + xorl %eax,%ecx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 113926993(%ebx,%edx,1),%edx + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 338241895(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx + addl %ecx,%ebx movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi + shrdl $14,%edx,%edx + movl 24(%esp),%esi xorl %ecx,%edx - movl %ebx,32(%esp) + movl 28(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi - movl %ecx,16(%esp) + movl %ecx,20(%esp) xorl %ecx,%edx - addl 28(%esp),%ebx xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 
4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 666307205(%ebx,%edx,1),%edx + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 
+ vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + 
vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 773529912(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1294757372(%ebx,%edx,1),%edx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + 
addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1396182291(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - 
xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1695183700(%ebx,%edx,1),%edx + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl 
$2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1986661051(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2177026350(%ebx,%edx,1),%edx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + vpsrld 
$10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2456956037(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - 
andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2730485921(%ebx,%edx,1),%edx + vpaddd %xmm7,%xmm2,%xmm2 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + 
movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2820302411(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3259730800(%ebx,%edx,1),%edx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 + addl 20(%esp),%edx + 
andl %eax,%ebx + addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3345764771(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl 
%ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3516065817(%ebx,%edx,1),%edx + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 
(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3600352804(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,88(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 4094571909(%ebx,%edx,1),%edx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + vpshufd 
$80,%xmm3,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 275423344(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 
24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 430227734(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl 
%edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 506948616(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 659060556(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) + leal 
(%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 883997877(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 958139571(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl 
$2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1322822218(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl 
%edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1537002063(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1747873779(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - 
addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1955562222(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - 
addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2024104815(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2227730452(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl 
$7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2361852424(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - 
movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2428436474(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2756734187(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 
28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3204031479(%ebx,%edx,1),%edx xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 
3329325298(%ebx,%edx,1),%edx - xorl %ecx,%esi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %esi,%eax + leal (%eax,%ecx,1),%eax movl 96(%esp),%esi - xorl %edi,%ebp + xorl %edi,%ebx movl 12(%esp),%ecx addl (%esi),%eax - addl 4(%esi),%ebp + addl 4(%esi),%ebx addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) - movl %ebp,4(%esi) + movl %ebx,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) - movl %ebp,4(%esp) - xorl %edi,%ebp + movl %ebx,4(%esp) + xorl %edi,%ebx movl %edi,8(%esp) movl %ecx,12(%esp) movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ecx + movl 24(%esp),%ecx addl 16(%esi),%edx addl 20(%esi),%edi - addl 24(%esi),%ebx - addl 28(%esi),%ecx + addl 24(%esi),%ecx movl %edx,16(%esi) movl %edi,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) movl 100(%esp),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L017grand_avx_bmi movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx popl %ebp ret -.align 32 -.L004shaext: - subl $32,%esp - movdqu (%esi),%xmm1 - leal 128(%ebp),%ebp - movdqu 16(%esi),%xmm2 - movdqa 128(%ebp),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext +.size sha256_block_data_order,.-.L_sha256_block_data_order_begin +.comm OPENSSL_ia32cap_P,16,4 +#else +.text +.globl sha256_block_data_order +.type sha256_block_data_order,@function .align 16 -.L010loop_shaext: - movdqu (%edi),%xmm3 - movdqu 16(%edi),%xmm4 - movdqu 32(%edi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%edi),%xmm6 - movdqa %xmm2,16(%esp) - movdqa -128(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 -.byte 
15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,(%esp) -.byte 15,56,203,202 - movdqa -112(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leal 64(%edi),%edi -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -80(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa -64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa -48(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -16(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa (%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 16(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 
102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 48(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - movdqa 96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa 128(%ebp),%xmm7 -.byte 15,56,203,202 - movdqa 112(%ebp),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - cmpl %edi,%eax - nop -.byte 15,56,203,202 - paddd 16(%esp),%xmm2 - paddd (%esp),%xmm1 - jnz .L010loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - movl 44(%esp),%esp - movdqu %xmm1,(%esi) - movdqu %xmm2,16(%esi) - popl %edi - popl %esi - popl %ebx +sha256_block_data_order: +.L_sha256_block_data_order_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L000pic_point +.L000pic_point: popl %ebp - ret -.align 32 -.L005SSSE3: - leal -96(%esp),%esp + leal .L001K256-.L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl 
%edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal OPENSSL_ia32cap_P,%edx + movl (%edx),%ecx + movl 4(%edx),%ebx + testl $1048576,%ecx + jnz .L002loop + movl 8(%edx),%edx + testl $16777216,%ecx + jz .L003no_xmm + andl $1073741824,%ecx + andl $268435968,%ebx + testl $536870912,%edx + jnz .L004shaext + orl %ebx,%ecx + andl $1342177280,%ecx + cmpl $1342177280,%ecx + je .L005AVX + testl $512,%ebx + jnz .L006SSSE3 +.L003no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae .L007unrolled + jmp .L002loop +.align 16 +.L002loop: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi - movl %ebx,4(%esp) + movl %ebx,8(%esp) xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) movl 16(%esi),%edx - movl 20(%esi),%edi + movl 20(%esi),%ebx movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) .align 16 -.L011grand_ssse3: - movdqu (%edi),%xmm0 - movdqu 16(%edi),%xmm1 - movdqu 32(%edi),%xmm2 - movdqu 
48(%edi),%xmm3 - addl $64,%edi -.byte 102,15,56,0,199 - movl %edi,100(%esp) -.byte 102,15,56,0,207 - movdqa (%ebp),%xmm4 -.byte 102,15,56,0,215 - movdqa 16(%ebp),%xmm5 - paddd %xmm0,%xmm4 -.byte 102,15,56,0,223 - movdqa 32(%ebp),%xmm6 - paddd %xmm1,%xmm5 - movdqa 48(%ebp),%xmm7 - movdqa %xmm4,32(%esp) - paddd %xmm2,%xmm6 - movdqa %xmm5,48(%esp) - paddd %xmm3,%xmm7 - movdqa %xmm6,64(%esp) - movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 +.L00800_15: + movl %edx,%ecx + movl 24(%esp),%esi + rorl $14,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne .L00800_15 + movl 156(%esp),%ecx + jmp .L00916_63 .align 16 -.L012ssse3_00_47: - addl $64,%ebp +.L00916_63: + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + rorl $14,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + 
rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne .L00916_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb .L002loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 64 +.L001K256: +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +.align 16 +.L007unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl 
%ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp .L010grand_loop +.align 16 +.L010grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl 
$6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + 
addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + 
rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 
4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 
8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 
32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 
4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + 
rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 
12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + 
movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl 
$6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl 
$2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl 
$10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 
12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + 
xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + 
rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl 
%esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 
3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl 
$7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi 
+ rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + 
andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + 
shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl 
%ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) 
+ xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi 
+ rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 
24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb .L010grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L004shaext: + subl $32,%esp + movdqu (%esi),%xmm1 + leal 128(%ebp),%ebp + movdqu 16(%esi),%xmm2 + movdqa 128(%ebp),%xmm7 + pshufd $27,%xmm1,%xmm0 + pshufd $177,%xmm1,%xmm1 + pshufd $27,%xmm2,%xmm2 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .L011loop_shaext +.align 16 +.L011loop_shaext: + movdqu (%edi),%xmm3 + movdqu 16(%edi),%xmm4 + movdqu 32(%edi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%edi),%xmm6 + movdqa %xmm2,16(%esp) + movdqa -128(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + nop + movdqa %xmm1,(%esp) +.byte 15,56,203,202 + movdqa -112(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + leal 64(%edi),%edi +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa -96(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa -80(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa -64(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa -48(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 
15,56,204,220 +.byte 15,56,203,202 + movdqa -32(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa -16(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa (%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 16(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 32(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 48(%ebp),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64(%ebp),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80(%ebp),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + movdqa 96(%ebp),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa 128(%ebp),%xmm7 +.byte 15,56,203,202 + movdqa 112(%ebp),%xmm0 + 
paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $14,%xmm0,%xmm0 + cmpl %edi,%eax + nop +.byte 15,56,203,202 + paddd 16(%esp),%xmm2 + paddd (%esp),%xmm1 + jnz .L011loop_shaext + pshufd $177,%xmm2,%xmm2 + pshufd $27,%xmm1,%xmm7 + pshufd $177,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + movl 44(%esp),%esp + movdqu %xmm1,(%esi) + movdqu %xmm2,16(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L006SSSE3: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp .L012grand_ssse3 +.align 16 +.L012grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi +.byte 102,15,56,0,199 + movl %edi,100(%esp) +.byte 102,15,56,0,207 + movdqa (%ebp),%xmm4 +.byte 102,15,56,0,215 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 +.byte 102,15,56,0,223 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp .L013ssse3_00_47 +.align 16 +.L013ssse3_00_47: + addl $64,%ebp + movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,224,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,250,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 
28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl %esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl 
%ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,225,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,251,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl 
%ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi +.byte 102,15,58,15,226,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,248,4 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl 
$2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl 
%edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi +.byte 102,15,58,15,227,4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi +.byte 102,15,58,15,249,4 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl 
%esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L013ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + 
movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl 
$14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 
80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 
20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L012grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 +.align 16 +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + 
movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + 
vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + 
xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl 
%ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 
20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor 
%xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 movl %edx,%ecx - movdqa %xmm1,%xmm4 - rorl $14,%edx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl 
%edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx movl 20(%esp),%esi - movdqa %xmm3,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi -.byte 102,15,58,15,224,4 xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi -.byte 102,15,58,15,250,4 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - movdqa %xmm4,%xmm6 addl %edi,%edx movl 4(%esp),%edi - psrld $3,%xmm4 movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 + shrdl $9,%ecx,%ecx movl %eax,(%esp) xorl %eax,%ecx - psrld $7,%xmm6 xorl %edi,%eax addl 28(%esp),%edx - rorl $11,%ecx + shrdl $11,%ecx,%ecx andl %eax,%ebx - pshufd $250,%xmm3,%xmm7 xorl %esi,%ecx addl 32(%esp),%edx - pslld $14,%xmm5 xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 + shrdl $2,%ecx,%ecx addl %edx,%ebx addl 12(%esp),%edx - psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + 
xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl 
%ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx movl 16(%esp),%esi xorl %ecx,%edx - pslld $11,%xmm5 movl 20(%esp),%edi xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 + shrdl $6,%edx,%edx movl %ebx,%ecx addl %edi,%edx - psrld $10,%xmm7 movl (%esp),%edi movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm0 + shrdl $9,%ecx,%ecx movl %ebx,28(%esp) xorl 
%ebx,%ecx - psrlq $17,%xmm6 xorl %edi,%ebx addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 + shrdl $11,%ecx,%ecx andl %ebx,%eax xorl %esi,%ecx - psrlq $2,%xmm6 - addl 36(%esp),%edx + addl 68(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 + shrdl $2,%ecx,%ecx addl %edx,%eax addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx - rorl $14,%edx + shrdl $14,%edx,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi - psrldq $8,%xmm7 movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi - paddd %xmm7,%xmm0 - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi - rorl $9,%ecx + shrdl $9,%ecx,%ecx movl %eax,24(%esp) - pshufd $80,%xmm0,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 + shrdl $11,%ecx,%ecx andl %eax,%ebx - psrlq $17,%xmm6 xorl %esi,%ecx - addl 40(%esp),%edx + addl 72(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 + shrdl $2,%ecx,%ecx addl %edx,%ebx addl 4(%esp),%edx - psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 + shrdl $14,%edx,%edx movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi - pshufd $8,%xmm7,%xmm7 xorl %edi,%esi - rorl $5,%edx - movdqa (%ebp),%xmm6 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,4(%esp) - pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi - rorl $6,%edx + shrdl $6,%edx,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 + shrdl $9,%ecx,%ecx movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx - paddd %xmm0,%xmm6 - rorl $11,%ecx + shrdl $11,%ecx,%ecx andl %ebx,%eax xorl %esi,%ecx - addl 44(%esp),%edx + addl 76(%esp),%edx xorl %edi,%eax - rorl $2,%ecx + shrdl $2,%ecx,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax - movdqa %xmm6,32(%esp) movl %edx,%ecx - movdqa %xmm2,%xmm4 - rorl $14,%edx + shrdl $14,%edx,%edx movl 
4(%esp),%esi - movdqa %xmm0,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi -.byte 102,15,58,15,225,4 xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi -.byte 102,15,58,15,251,4 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx - movdqa %xmm4,%xmm6 addl %edi,%edx movl 20(%esp),%edi - psrld $3,%xmm4 movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 + shrdl $9,%ecx,%ecx movl %eax,16(%esp) xorl %eax,%ecx - psrld $7,%xmm6 xorl %edi,%eax addl 12(%esp),%edx - rorl $11,%ecx + shrdl $11,%ecx,%ecx andl %eax,%ebx - pshufd $250,%xmm0,%xmm7 xorl %esi,%ecx - addl 48(%esp),%edx - pslld $14,%xmm5 + addl 80(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 + shrdl $2,%ecx,%ecx addl %edx,%ebx addl 28(%esp),%edx - psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 + shrdl $14,%edx,%edx movl (%esp),%esi xorl %ecx,%edx - pslld $11,%xmm5 movl 4(%esp),%edi xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 + shrdl $5,%edx,%edx andl %ecx,%esi movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 + shrdl $6,%edx,%edx movl %ebx,%ecx addl %edi,%edx - psrld $10,%xmm7 movl 16(%esp),%edi movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm1 + shrdl $9,%ecx,%ecx movl %ebx,12(%esp) xorl %ebx,%ecx - psrlq $17,%xmm6 xorl %edi,%ebx addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 + shrdl $11,%ecx,%ecx andl %ebx,%eax xorl %esi,%ecx - psrlq $2,%xmm6 - addl 52(%esp),%edx + addl 84(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 + shrdl $2,%ecx,%ecx addl %edx,%eax addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx - rorl $14,%edx + shrdl $14,%edx,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi - rorl $5,%edx + shrdl $5,%edx,%edx andl %ecx,%esi - psrldq $8,%xmm7 movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi - paddd %xmm7,%xmm1 - rorl $6,%edx + shrdl $6,%edx,%edx movl %eax,%ecx addl 
%edi,%edx movl 12(%esp),%edi movl %eax,%esi - rorl $9,%ecx + shrdl $9,%ecx,%ecx movl %eax,8(%esp) - pshufd $80,%xmm1,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 
20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl 
%edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl 
$13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx addl 56(%esp),%edx + vpsrld $10,%xmm7,%xmm6 xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx + addl %edx,%ecx addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - 
pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 16(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 movl %ebx,4(%esp) - xorl %ebx,%ecx + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 addl (%esp),%edx - paddd %xmm1,%xmm6 - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,48(%esp) - movl %edx,%ecx - movdqa %xmm3,%xmm4 - rorl $14,%edx - movl 20(%esp),%esi - movdqa %xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi -.byte 102,15,58,15,226,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,248,4 - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 4(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx movl %eax,(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 + vpaddd %xmm7,%xmm2,%xmm2 + orl 
%esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 addl 28(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - pshufd $250,%xmm1,%xmm7 - xorl %esi,%ecx addl 64(%esp),%edx - pslld $14,%xmm5 + vpshufd $250,%xmm1,%xmm7 xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx + addl %edx,%ecx addl 12(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl 16(%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm2 + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 movl %ebx,28(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl 
%ecx,%esi - psrldq $8,%xmm7 - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm2 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,24(%esp) - pshufd $80,%xmm2,%xmm7 - xorl %eax,%ecx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi + xorl %esi,%ecx xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx + addl %edx,%ecx addl 4(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 32(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,4(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 movl %ebx,20(%esp) - xorl %ebx,%ecx + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl 
%edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 addl 16(%esp),%edx - paddd %xmm2,%xmm6 - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %ecx,%eax - movdqa %xmm6,64(%esp) - movl %edx,%ecx - movdqa %xmm0,%xmm4 - rorl $14,%edx - movl 4(%esp),%esi - movdqa %xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi -.byte 102,15,58,15,227,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,249,4 - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 20(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 - movl %eax,16(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm2,%xmm7 - xorl %esi,%ecx - addl 80(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl (%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm3 + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl 
$13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 movl %ebx,12(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm3 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 movl %eax,8(%esp) - pshufd $80,%xmm3,%xmm7 - xorl %eax,%ecx + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + 
vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi + xorl %esi,%ecx xorl %edi,%eax + vpshufd $80,%xmm3,%xmm7 addl 4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx + addl %edx,%ecx addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 48(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 movl %ebx,4(%esp) - xorl %ebx,%ecx + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 addl (%esp),%edx - paddd %xmm3,%xmm6 - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,80(%esp) + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + 
movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 28(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 32(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 24(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 36(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl 
%edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 20(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 40(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 16(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 44(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 20(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 
12(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 48(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 8(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 52(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 4(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 56(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 
28(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl (%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 60(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 28(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 64(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi 
+ rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 24(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 68(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 20(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 72(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 24(%esp),%edi - movl %ebx,%esi - rorl 
$9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 16(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 76(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 20(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 12(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 80(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl 8(%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 84(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 24(%esp),%edx - addl %ecx,%eax - movl 
%edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx + xorl %esi,%ecx xorl %edi,%eax addl 4(%esp),%edx - rorl $11,%ecx andl %eax,%ebx - xorl %esi,%ecx addl 88(%esp),%edx xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx + addl %edx,%ecx addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx + xorl %esi,%ecx xorl %edi,%ebx addl (%esp),%edx - rorl $11,%ecx andl %ebx,%eax - xorl %esi,%ecx addl 92(%esp),%edx xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax + addl %edx,%ecx addl 16(%esp),%edx - addl %ecx,%eax + leal (%eax,%ecx,1),%eax movl 96(%esp),%esi xorl %edi,%ebx movl 12(%esp),%ecx @@ -9142,11 +13553,12 @@ sha256_block_data_order: movl %edi,28(%esi) movl %edi,28(%esp) movl 100(%esp),%edi - movdqa 64(%ebp),%xmm7 + vmovdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - 
jb .L011grand_ssse3 + jb .L017grand_avx_bmi movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx -- 2.45.0