crypto/openssl/crypto/aes/asm/aesv8-armx.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2014-2019 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # This module implements support for ARMv8 AES instructions. The
  18 # module is endian-agnostic in sense that it supports both big- and
  19 # little-endian cases. As does it support both 32- and 64-bit modes
  20 # of operation. Latter is achieved by limiting amount of utilized
  21 # registers to 16, which implies additional NEON load and integer
  22 # instructions. This has no effect on mighty Apple A7, where results
  23 # are literally equal to the theoretical estimates based on AES
  24 # instruction latencies and issue rates. On Cortex-A53, an in-order
  25 # execution core, this costs up to 10-15%, which is partially
  26 # compensated by implementing dedicated code path for 128-bit
  27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
  28 # seems to be limited by sheer amount of NEON instructions...
  29 #
  30 # Performance in cycles per byte processed with 128-bit key:
  31 #
  32 #               CBC enc         CBC dec         CTR
  33 # Apple A7      2.39            1.20            1.20
  34 # Cortex-A53    1.32            1.29            1.46
  35 # Cortex-A57(*) 1.95            0.85            0.93
  36 # Denver        1.96            0.86            0.80
  37 # Mongoose      1.33            1.20            1.20
  38 # Kryo          1.26            0.94            1.00
  39 #
  40 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
  41 #       and are still same even for updated module;
  42
  43 $flavour = shift;      # 1st argument: perlasm flavour; anything matching /64/ selects AArch64 output
  44 $output  = shift;      # 2nd argument: passed through to arm-xlate.pl unchanged
  45
     # Locate the arm-xlate.pl translator: first next to this script, then
     # in ../../perlasm relative to it.
  46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  47 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  48 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  49 die "can't locate arm-xlate.pl";
  50
     # Everything printed from here on is piped through arm-xlate.pl, run
     # with the same perl binary ($^X).  A piped open only fails when the
     # child cannot be started; without the check STDOUT would silently be
     # aliased to an unopened handle and all output would be lost.
  51 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
  52 *STDOUT=*OUT;
  53
     # Common prefix of every global symbol emitted below.
  54 $prefix="aes_v8";
  55
  56 $code=<<___;           # start of the generated assembly, common to all flavours
  57 #include "arm_arch.h"
  58
  59 #if __ARM_MAX_ARCH__>=7
  60 .text
  61 ___
     # NOTE(review): the AArch64 ".arch" directive on the next line is
     # disabled, and the 64-bit post-processing loop at the end of this file
     # emits the AES mnemonics as-is (its unaes() rewrite is commented out
     # too).  Presumably the crypto extension is enabled through the
     # assembler/compiler flags instead - confirm against the build.
  62 # $code.=".arch armv8-a+crypto\n"                       if ($flavour =~ /64/);
     # 32-bit flavours only: plain ARMv7-A/NEON, ARM (not Thumb-2) encoding.
  63 $code.=<<___                                            if ($flavour !~ /64/);
  64 .arch   armv7-a // don't confuse not-so-latest binutils with argv8 :-)
  65 .fpu    neon
  66 .code   32
  67 #undef  __thumb2__
  68 ___
  69
  70 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
  71 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
  72 # maintain both 32- and 64-bit codes within single module and
  73 # transliterate common code to either flavour with regex voodoo.
  74 #
  75 {{{
     # Key schedule section: emits ${prefix}_set_encrypt_key and
     # ${prefix}_set_decrypt_key.
     #
     # Registers are written in the mixed 32-/64-bit notation described
     # above and rewritten per flavour by the post-processing loops at the
     # end of this file.
     #   $inp    (x0)  pointer to the key material that is loaded with vld1.8
     #   $bits   (w1)  key length in bits; reused as a loop counter later on
     #   $out    (x2)  pointer the round keys are stored through
     #   $ptr    (x3)  scratch: pending return value, then pointer to .Lrcon
     #   $rounds (w12) round count, stored right behind the round keys
     # The 32-bit flavour uses q0-q3,q8-q10 rather than q0-q6 - presumably to
     # stay clear of q4-q7 (d8-d15), which the bulk routines further down
     # save and restore; TODO confirm.
  76 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
  77 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
  78         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
  79
  80
     # Constant table (.Lrcon) and entry labels.  .Lenc_key is a local alias
     # that ${prefix}_set_decrypt_key branches to with "bl".
  81 $code.=<<___;
  82 .align  5
  83 .Lrcon:
  84 .long   0x01,0x01,0x01,0x01
  85 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
  86 .long   0x1b,0x1b,0x1b,0x1b
  87
  88 .globl  ${prefix}_set_encrypt_key
  89 .type   ${prefix}_set_encrypt_key,%function
  90 .align  5
  91 ${prefix}_set_encrypt_key:
  92 .Lenc_key:
  93 ___
     # 64-bit flavours only: push x29/x30 and point x29 at the saved pair.
  94 $code.=<<___    if ($flavour =~ /64/);
  95         stp     x29,x30,[sp,#-16]!
  96         add     x29,sp,#0
  97 ___
     # Body of set_encrypt_key, shared by both flavours.
     #   Return value (x0): -1 if $inp or $out is NULL; -2 if $bits is below
     #   128, above 256 or not a multiple of 64; 0 on success.
     #   Three expansion paths follow: .Loop128 (8 looped + 2 unrolled
     #   steps, rounds=10), .Loop192 (8 iterations, rounds=12) and .Loop256
     #   (counter starts at 7, rounds=14).  Each path leaves $out at offset
     #   240 from its initial value - set_decrypt_key subtracts 240 to
     #   restore it - and .Ldone stores $rounds there.
     #   The backquoted Perl expression before "ret" is evaluated by the
     #   post-processing loop; for 64-bit flavours it emits the instruction
     #   that pops the frame pushed above.
  98 $code.=<<___;
  99         mov     $ptr,#-1
 100         cmp     $inp,#0
 101         b.eq    .Lenc_key_abort
 102         cmp     $out,#0
 103         b.eq    .Lenc_key_abort
 104         mov     $ptr,#-2
 105         cmp     $bits,#128
 106         b.lt    .Lenc_key_abort
 107         cmp     $bits,#256
 108         b.gt    .Lenc_key_abort
 109         tst     $bits,#0x3f
 110         b.ne    .Lenc_key_abort
 111
 112         adr     $ptr,.Lrcon
 113         cmp     $bits,#192
 114
 115         veor    $zero,$zero,$zero
 116         vld1.8  {$in0},[$inp],#16
 117         mov     $bits,#8                // reuse $bits
 118         vld1.32 {$rcon,$mask},[$ptr],#32
 119
 120         b.lt    .Loop128
 121         b.eq    .L192
 122         b       .L256
 123
 124 .align  4
 125 .Loop128:
 126         vtbl.8  $key,{$in0},$mask
 127         vext.8  $tmp,$zero,$in0,#12
 128         vst1.32 {$in0},[$out],#16
 129         aese    $key,$zero
 130         subs    $bits,$bits,#1
 131
 132         veor    $in0,$in0,$tmp
 133         vext.8  $tmp,$zero,$tmp,#12
 134         veor    $in0,$in0,$tmp
 135         vext.8  $tmp,$zero,$tmp,#12
 136          veor   $key,$key,$rcon
 137         veor    $in0,$in0,$tmp
 138         vshl.u8 $rcon,$rcon,#1
 139         veor    $in0,$in0,$key
 140         b.ne    .Loop128
 141
 142         vld1.32 {$rcon},[$ptr]
 143
 144         vtbl.8  $key,{$in0},$mask
 145         vext.8  $tmp,$zero,$in0,#12
 146         vst1.32 {$in0},[$out],#16
 147         aese    $key,$zero
 148
 149         veor    $in0,$in0,$tmp
 150         vext.8  $tmp,$zero,$tmp,#12
 151         veor    $in0,$in0,$tmp
 152         vext.8  $tmp,$zero,$tmp,#12
 153          veor   $key,$key,$rcon
 154         veor    $in0,$in0,$tmp
 155         vshl.u8 $rcon,$rcon,#1
 156         veor    $in0,$in0,$key
 157
 158         vtbl.8  $key,{$in0},$mask
 159         vext.8  $tmp,$zero,$in0,#12
 160         vst1.32 {$in0},[$out],#16
 161         aese    $key,$zero
 162
 163         veor    $in0,$in0,$tmp
 164         vext.8  $tmp,$zero,$tmp,#12
 165         veor    $in0,$in0,$tmp
 166         vext.8  $tmp,$zero,$tmp,#12
 167          veor   $key,$key,$rcon
 168         veor    $in0,$in0,$tmp
 169         veor    $in0,$in0,$key
 170         vst1.32 {$in0},[$out]
 171         add     $out,$out,#0x50
 172
 173         mov     $rounds,#10
 174         b       .Ldone
 175
 176 .align  4
 177 .L192:
 178         vld1.8  {$in1},[$inp],#8
 179         vmov.i8 $key,#8                 // borrow $key
 180         vst1.32 {$in0},[$out],#16
 181         vsub.i8 $mask,$mask,$key        // adjust the mask
 182
 183 .Loop192:
 184         vtbl.8  $key,{$in1},$mask
 185         vext.8  $tmp,$zero,$in0,#12
 186         vst1.32 {$in1},[$out],#8
 187         aese    $key,$zero
 188         subs    $bits,$bits,#1
 189
 190         veor    $in0,$in0,$tmp
 191         vext.8  $tmp,$zero,$tmp,#12
 192         veor    $in0,$in0,$tmp
 193         vext.8  $tmp,$zero,$tmp,#12
 194         veor    $in0,$in0,$tmp
 195
 196         vdup.32 $tmp,${in0}[3]
 197         veor    $tmp,$tmp,$in1
 198          veor   $key,$key,$rcon
 199         vext.8  $in1,$zero,$in1,#12
 200         vshl.u8 $rcon,$rcon,#1
 201         veor    $in1,$in1,$tmp
 202         veor    $in0,$in0,$key
 203         veor    $in1,$in1,$key
 204         vst1.32 {$in0},[$out],#16
 205         b.ne    .Loop192
 206
 207         mov     $rounds,#12
 208         add     $out,$out,#0x20
 209         b       .Ldone
 210
 211 .align  4
 212 .L256:
 213         vld1.8  {$in1},[$inp]
 214         mov     $bits,#7
 215         mov     $rounds,#14
 216         vst1.32 {$in0},[$out],#16
 217
 218 .Loop256:
 219         vtbl.8  $key,{$in1},$mask
 220         vext.8  $tmp,$zero,$in0,#12
 221         vst1.32 {$in1},[$out],#16
 222         aese    $key,$zero
 223         subs    $bits,$bits,#1
 224
 225         veor    $in0,$in0,$tmp
 226         vext.8  $tmp,$zero,$tmp,#12
 227         veor    $in0,$in0,$tmp
 228         vext.8  $tmp,$zero,$tmp,#12
 229          veor   $key,$key,$rcon
 230         veor    $in0,$in0,$tmp
 231         vshl.u8 $rcon,$rcon,#1
 232         veor    $in0,$in0,$key
 233         vst1.32 {$in0},[$out],#16
 234         b.eq    .Ldone
 235
 236         vdup.32 $key,${in0}[3]          // just splat
 237         vext.8  $tmp,$zero,$in1,#12
 238         aese    $key,$zero
 239
 240         veor    $in1,$in1,$tmp
 241         vext.8  $tmp,$zero,$tmp,#12
 242         veor    $in1,$in1,$tmp
 243         vext.8  $tmp,$zero,$tmp,#12
 244         veor    $in1,$in1,$tmp
 245
 246         veor    $in1,$in1,$key
 247         b       .Loop256
 248
 249 .Ldone:
 250         str     $rounds,[$out]
 251         mov     $ptr,#0
 252
 253 .Lenc_key_abort:
 254         mov     x0,$ptr                 // return value
 255         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
 256         ret
 257 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
 258
 259 .globl  ${prefix}_set_decrypt_key
 260 .type   ${prefix}_set_decrypt_key,%function
 261 .align  5
 262 ${prefix}_set_decrypt_key:
 263 ___
     # set_decrypt_key prologue, 64-bit flavours: the .inst word is the
     # encoding named in its trailing comment (paciasp), followed by the
     # x29/x30 push.
 264 $code.=<<___    if ($flavour =~ /64/);
 265         .inst   0xd503233f              // paciasp
 266         stp     x29,x30,[sp,#-16]!
 267         add     x29,sp,#0
 268 ___
     # set_decrypt_key prologue, 32-bit flavours: save r4 and the return
     # address (r4 is x4 after register renaming and is used as the -16
     # stride below).
 269 $code.=<<___    if ($flavour !~ /64/);
 270         stmdb   sp!,{r4,lr}
 271 ___
     # Body of set_decrypt_key: build the encryption schedule via .Lenc_key,
     # bail out with its return value if that is non-zero, then reverse the
     # order of the round keys in place.  The first and last keys are only
     # swapped; every key in between additionally goes through aesimc.
     # x12 is $rounds as left behind by .Lenc_key.
 272 $code.=<<___;
 273         bl      .Lenc_key
 274
 275         cmp     x0,#0
 276         b.ne    .Ldec_key_abort
 277
 278         sub     $out,$out,#240          // restore original $out
 279         mov     x4,#-16
 280         add     $inp,$out,x12,lsl#4     // end of key schedule
 281
 282         vld1.32 {v0.16b},[$out]
 283         vld1.32 {v1.16b},[$inp]
 284         vst1.32 {v0.16b},[$inp],x4
 285         vst1.32 {v1.16b},[$out],#16
 286
 287 .Loop_imc:
 288         vld1.32 {v0.16b},[$out]
 289         vld1.32 {v1.16b},[$inp]
 290         aesimc  v0.16b,v0.16b
 291         aesimc  v1.16b,v1.16b
 292         vst1.32 {v0.16b},[$inp],x4
 293         vst1.32 {v1.16b},[$out],#16
 294         cmp     $inp,$out
 295         b.hi    .Loop_imc
 296
 297         vld1.32 {v0.16b},[$out]
 298         aesimc  v0.16b,v0.16b
 299         vst1.32 {v0.16b},[$inp]
 300
 301         eor     x0,x0,x0                // return value
 302 .Ldec_key_abort:
 303 ___
     # 32-bit epilogue: restore r4 and return by popping the saved lr into pc.
 304 $code.=<<___    if ($flavour !~ /64/);
 305         ldmia   sp!,{r4,pc}
 306 ___
     # 64-bit epilogue: pop x29/x30, then the .inst word named autiasp in
     # its trailing comment, then return.
 307 $code.=<<___    if ($flavour =~ /64/);
 308         ldp     x29,x30,[sp],#16
 309         .inst   0xd50323bf              // autiasp
 310         ret
 311 ___
 312 $code.=<<___;
 313 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 314 ___
 315 }}}
 316 {{{
     # gen_block(DIR) appends a single-block routine to $code.
     #   DIR is "en" or "de": "en" emits ${prefix}_encrypt built from
     #   aese/aesmc, any other value emits ${prefix}_<DIR>crypt built from
     #   aesd/aesimc.
     # Generated routine: x0 = input block, x1 = output block, x2 = key
     # schedule with the round count read from offset 240.  Round keys are
     # consumed two per loop iteration, with the final rounds handled after
     # the loop.
     # The sub takes an argument, so it is declared without the former
     # empty "()" prototype and is called without the "&" sigil that was
     # only needed to bypass that prototype.
 317 sub gen_block {
 318 my $dir = shift;
 319 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
 320 my ($inp,$out,$key)=map("x$_",(0..2));
 321 my $rounds="w3";
     # exactly three NEON registers are needed: two round keys and the block
 322 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));
 323
 324 $code.=<<___;
 325 .globl  ${prefix}_${dir}crypt
 326 .type   ${prefix}_${dir}crypt,%function
 327 .align  5
 328 ${prefix}_${dir}crypt:
 329         ldr     $rounds,[$key,#240]
 330         vld1.32 {$rndkey0},[$key],#16
 331         vld1.8  {$inout},[$inp]
 332         sub     $rounds,$rounds,#2
 333         vld1.32 {$rndkey1},[$key],#16
 334
 335 .Loop_${dir}c:
 336         aes$e   $inout,$rndkey0
 337         aes$mc  $inout,$inout
 338         vld1.32 {$rndkey0},[$key],#16
 339         subs    $rounds,$rounds,#2
 340         aes$e   $inout,$rndkey1
 341         aes$mc  $inout,$inout
 342         vld1.32 {$rndkey1},[$key],#16
 343         b.gt    .Loop_${dir}c
 344
 345         aes$e   $inout,$rndkey0
 346         aes$mc  $inout,$inout
 347         vld1.32 {$rndkey0},[$key]
 348         aes$e   $inout,$rndkey1
 349         veor    $inout,$inout,$rndkey0
 350
 351         vst1.8  {$inout},[$out]
 352         ret
 353 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 354 ___
 355 }
 356 gen_block("en");
 357 gen_block("de");
 358 }}}
 359 {{{
     # CBC section: emits ${prefix}_cbc_encrypt.
     #   $inp (x0) input, $out (x1) output, $len (x2) length in bytes,
     #   $key (x3) key schedule, $ivp (x4) IV, $enc (w5) direction flag
     #   (zero takes the .Lcbc_dec path).  In 32-bit flavours the last two
     #   arguments are fetched from the caller's stack into r4-r5.
     # Behaviour visible in the code below:
     #   - a length below 16 jumps straight to .Lcbc_abort, nothing is
     #     read or written;
     #   - the length is rounded down to a multiple of 16;
     #   - on .Lcbc_done the updated IV is stored back through $ivp.
     # $rounds shares w5 with $enc: the flag is tested (cmp) before the
     # round count is loaded over it, and the flags survive until the
     # "b.eq .Lcbc_dec" because nothing in between sets them.
 360 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
 361 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
 362 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 363
     # Aliases used by the single-block encrypt paths; $key4..$key7 point at
     # round keys 4..7 and overlap $cnt (x6), $step1 (x12) and $key itself.
 364 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
 365 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
 366
 367 ### q8-q15      preloaded key schedule
 368
 369 $code.=<<___;
 370 .globl  ${prefix}_cbc_encrypt
 371 .type   ${prefix}_cbc_encrypt,%function
 372 .align  5
 373 ${prefix}_cbc_encrypt:
 374 ___
     # 64-bit prologue: push x29/x30.
 375 $code.=<<___    if ($flavour =~ /64/);
 376         stp     x29,x30,[sp,#-16]!
 377         add     x29,sp,#0
 378 ___
     # 32-bit prologue: save r4-r8,lr and d8-d15, then load arguments 5 and
     # 6 from the caller's stack (ip holds the incoming sp).
 379 $code.=<<___    if ($flavour !~ /64/);
 380         mov     ip,sp
 381         stmdb   sp!,{r4-r8,lr}
 382         vstmdb  sp!,{d8-d15}            @ ABI specification says so
 383         ldmia   ip,{r4-r5}              @ load remaining args
 384 ___
     # Common setup plus the two encrypt loops.
     #   $step is the input post-increment; "cclr" (rewritten to a
     #   conditional clear by the post-processors) zeroes it when the block
     #   just loaded is the last one, so the following load does not advance
     #   past the input.
     #   After the two "sub" instructions $rounds holds rounds-8, hence
     #   "cmp $rounds,#2" selects the dedicated 128-bit loop (.Lcbc_enc128)
     #   and "cmp $rounds,#4" inside the generic loop skips two rounds for
     #   192-bit keys (.Lcbc_enc192).
     #   $rndzero_n_last is round key 0 xor the last round key; it is xor-ed
     #   into the next input block while the current one is still being
     #   processed.
 385 $code.=<<___;
 386         subs    $len,$len,#16
 387         mov     $step,#16
 388         b.lo    .Lcbc_abort
 389         cclr    $step,eq
 390
 391         cmp     $enc,#0                 // en- or decrypting?
 392         ldr     $rounds,[$key,#240]
 393         and     $len,$len,#-16
 394         vld1.8  {$ivec},[$ivp]
 395         vld1.8  {$dat},[$inp],$step
 396
 397         vld1.32 {q8-q9},[$key]          // load key schedule...
 398         sub     $rounds,$rounds,#6
 399         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
 400         sub     $rounds,$rounds,#2
 401         vld1.32 {q10-q11},[$key_],#32
 402         vld1.32 {q12-q13},[$key_],#32
 403         vld1.32 {q14-q15},[$key_],#32
 404         vld1.32 {$rndlast},[$key_]
 405
 406         add     $key_,$key,#32
 407         mov     $cnt,$rounds
 408         b.eq    .Lcbc_dec
 409
 410         cmp     $rounds,#2
 411         veor    $dat,$dat,$ivec
 412         veor    $rndzero_n_last,q8,$rndlast
 413         b.eq    .Lcbc_enc128
 414
 415         vld1.32 {$in0-$in1},[$key_]
 416         add     $key_,$key,#16
 417         add     $key4,$key,#16*4
 418         add     $key5,$key,#16*5
 419         aese    $dat,q8
 420         aesmc   $dat,$dat
 421         add     $key6,$key,#16*6
 422         add     $key7,$key,#16*7
 423         b       .Lenter_cbc_enc
 424
 425 .align  4
 426 .Loop_cbc_enc:
 427         aese    $dat,q8
 428         aesmc   $dat,$dat
 429          vst1.8 {$ivec},[$out],#16
 430 .Lenter_cbc_enc:
 431         aese    $dat,q9
 432         aesmc   $dat,$dat
 433         aese    $dat,$in0
 434         aesmc   $dat,$dat
 435         vld1.32 {q8},[$key4]
 436         cmp     $rounds,#4
 437         aese    $dat,$in1
 438         aesmc   $dat,$dat
 439         vld1.32 {q9},[$key5]
 440         b.eq    .Lcbc_enc192
 441
 442         aese    $dat,q8
 443         aesmc   $dat,$dat
 444         vld1.32 {q8},[$key6]
 445         aese    $dat,q9
 446         aesmc   $dat,$dat
 447         vld1.32 {q9},[$key7]
 448         nop
 449
 450 .Lcbc_enc192:
 451         aese    $dat,q8
 452         aesmc   $dat,$dat
 453          subs   $len,$len,#16
 454         aese    $dat,q9
 455         aesmc   $dat,$dat
 456          cclr   $step,eq
 457         aese    $dat,q10
 458         aesmc   $dat,$dat
 459         aese    $dat,q11
 460         aesmc   $dat,$dat
 461          vld1.8 {q8},[$inp],$step
 462         aese    $dat,q12
 463         aesmc   $dat,$dat
 464          veor   q8,q8,$rndzero_n_last
 465         aese    $dat,q13
 466         aesmc   $dat,$dat
 467          vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
 468         aese    $dat,q14
 469         aesmc   $dat,$dat
 470         aese    $dat,q15
 471         veor    $ivec,$dat,$rndlast
 472         b.hs    .Loop_cbc_enc
 473
 474         vst1.8  {$ivec},[$out],#16
 475         b       .Lcbc_done
 476
 477 .align  5
 478 .Lcbc_enc128:
 479         vld1.32 {$in0-$in1},[$key_]
 480         aese    $dat,q8
 481         aesmc   $dat,$dat
 482         b       .Lenter_cbc_enc128
 483 .Loop_cbc_enc128:
 484         aese    $dat,q8
 485         aesmc   $dat,$dat
 486          vst1.8 {$ivec},[$out],#16
 487 .Lenter_cbc_enc128:
 488         aese    $dat,q9
 489         aesmc   $dat,$dat
 490          subs   $len,$len,#16
 491         aese    $dat,$in0
 492         aesmc   $dat,$dat
 493          cclr   $step,eq
 494         aese    $dat,$in1
 495         aesmc   $dat,$dat
 496         aese    $dat,q10
 497         aesmc   $dat,$dat
 498         aese    $dat,q11
 499         aesmc   $dat,$dat
 500          vld1.8 {q8},[$inp],$step
 501         aese    $dat,q12
 502         aesmc   $dat,$dat
 503         aese    $dat,q13
 504         aesmc   $dat,$dat
 505         aese    $dat,q14
 506         aesmc   $dat,$dat
 507          veor   q8,q8,$rndzero_n_last
 508         aese    $dat,q15
 509         veor    $ivec,$dat,$rndlast
 510         b.hs    .Loop_cbc_enc128
 511
 512         vst1.8  {$ivec},[$out],#16
 513         b       .Lcbc_done
 514 ___
 515 {
     # Decrypt path.  It works on up to three blocks at a time and needs
     # three more NEON registers; q9-q11 double as round-key registers in
     # the encrypt paths, which is why q8/q9 are re-loaded from $key_ on
     # every pass ("re-pre-load" below).
     #   .Loop3x_cbc_dec  three blocks per iteration
     #   .Lcbc_dec_tail   the remaining one or two blocks
     #   .Lcbc_dec_one    output for the single-block remainder
 516 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 517 $code.=<<___;
 518 .align  5
 519 .Lcbc_dec:
 520         vld1.8  {$dat2},[$inp],#16
 521         subs    $len,$len,#32           // bias
 522         add     $cnt,$rounds,#2
 523         vorr    $in1,$dat,$dat
 524         vorr    $dat1,$dat,$dat
 525         vorr    $in2,$dat2,$dat2
 526         b.lo    .Lcbc_dec_tail
 527
 528         vorr    $dat1,$dat2,$dat2
 529         vld1.8  {$dat2},[$inp],#16
 530         vorr    $in0,$dat,$dat
 531         vorr    $in1,$dat1,$dat1
 532         vorr    $in2,$dat2,$dat2
 533
 534 .Loop3x_cbc_dec:
 535         aesd    $dat0,q8
 536         aesimc  $dat0,$dat0
 537         aesd    $dat1,q8
 538         aesimc  $dat1,$dat1
 539         aesd    $dat2,q8
 540         aesimc  $dat2,$dat2
 541         vld1.32 {q8},[$key_],#16
 542         subs    $cnt,$cnt,#2
 543         aesd    $dat0,q9
 544         aesimc  $dat0,$dat0
 545         aesd    $dat1,q9
 546         aesimc  $dat1,$dat1
 547         aesd    $dat2,q9
 548         aesimc  $dat2,$dat2
 549         vld1.32 {q9},[$key_],#16
 550         b.gt    .Loop3x_cbc_dec
 551
 552         aesd    $dat0,q8
 553         aesimc  $dat0,$dat0
 554         aesd    $dat1,q8
 555         aesimc  $dat1,$dat1
 556         aesd    $dat2,q8
 557         aesimc  $dat2,$dat2
 558          veor   $tmp0,$ivec,$rndlast
 559          subs   $len,$len,#0x30
 560          veor   $tmp1,$in0,$rndlast
 561          mov.lo x6,$len                 // x6, $cnt, is zero at this point
 562         aesd    $dat0,q9
 563         aesimc  $dat0,$dat0
 564         aesd    $dat1,q9
 565         aesimc  $dat1,$dat1
 566         aesd    $dat2,q9
 567         aesimc  $dat2,$dat2
 568          veor   $tmp2,$in1,$rndlast
 569          add    $inp,$inp,x6            // $inp is adjusted in such way that
 570                                         // at exit from the loop $dat1-$dat2
 571                                         // are loaded with last "words"
 572          vorr   $ivec,$in2,$in2
 573          mov    $key_,$key
 574         aesd    $dat0,q12
 575         aesimc  $dat0,$dat0
 576         aesd    $dat1,q12
 577         aesimc  $dat1,$dat1
 578         aesd    $dat2,q12
 579         aesimc  $dat2,$dat2
 580          vld1.8 {$in0},[$inp],#16
 581         aesd    $dat0,q13
 582         aesimc  $dat0,$dat0
 583         aesd    $dat1,q13
 584         aesimc  $dat1,$dat1
 585         aesd    $dat2,q13
 586         aesimc  $dat2,$dat2
 587          vld1.8 {$in1},[$inp],#16
 588         aesd    $dat0,q14
 589         aesimc  $dat0,$dat0
 590         aesd    $dat1,q14
 591         aesimc  $dat1,$dat1
 592         aesd    $dat2,q14
 593         aesimc  $dat2,$dat2
 594          vld1.8 {$in2},[$inp],#16
 595         aesd    $dat0,q15
 596         aesd    $dat1,q15
 597         aesd    $dat2,q15
 598          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
 599          add    $cnt,$rounds,#2
 600         veor    $tmp0,$tmp0,$dat0
 601         veor    $tmp1,$tmp1,$dat1
 602         veor    $dat2,$dat2,$tmp2
 603          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 604         vst1.8  {$tmp0},[$out],#16
 605          vorr   $dat0,$in0,$in0
 606         vst1.8  {$tmp1},[$out],#16
 607          vorr   $dat1,$in1,$in1
 608         vst1.8  {$dat2},[$out],#16
 609          vorr   $dat2,$in2,$in2
 610         b.hs    .Loop3x_cbc_dec
 611
 612         cmn     $len,#0x30
 613         b.eq    .Lcbc_done
 614         nop
 615
 616 .Lcbc_dec_tail:
 617         aesd    $dat1,q8
 618         aesimc  $dat1,$dat1
 619         aesd    $dat2,q8
 620         aesimc  $dat2,$dat2
 621         vld1.32 {q8},[$key_],#16
 622         subs    $cnt,$cnt,#2
 623         aesd    $dat1,q9
 624         aesimc  $dat1,$dat1
 625         aesd    $dat2,q9
 626         aesimc  $dat2,$dat2
 627         vld1.32 {q9},[$key_],#16
 628         b.gt    .Lcbc_dec_tail
 629
 630         aesd    $dat1,q8
 631         aesimc  $dat1,$dat1
 632         aesd    $dat2,q8
 633         aesimc  $dat2,$dat2
 634         aesd    $dat1,q9
 635         aesimc  $dat1,$dat1
 636         aesd    $dat2,q9
 637         aesimc  $dat2,$dat2
 638         aesd    $dat1,q12
 639         aesimc  $dat1,$dat1
 640         aesd    $dat2,q12
 641         aesimc  $dat2,$dat2
 642          cmn    $len,#0x20
 643         aesd    $dat1,q13
 644         aesimc  $dat1,$dat1
 645         aesd    $dat2,q13
 646         aesimc  $dat2,$dat2
 647          veor   $tmp1,$ivec,$rndlast
 648         aesd    $dat1,q14
 649         aesimc  $dat1,$dat1
 650         aesd    $dat2,q14
 651         aesimc  $dat2,$dat2
 652          veor   $tmp2,$in1,$rndlast
 653         aesd    $dat1,q15
 654         aesd    $dat2,q15
 655         b.eq    .Lcbc_dec_one
 656         veor    $tmp1,$tmp1,$dat1
 657         veor    $tmp2,$tmp2,$dat2
 658          vorr   $ivec,$in2,$in2
 659         vst1.8  {$tmp1},[$out],#16
 660         vst1.8  {$tmp2},[$out],#16
 661         b       .Lcbc_done
 662
 663 .Lcbc_dec_one:
 664         veor    $tmp1,$tmp1,$dat2
 665          vorr   $ivec,$in2,$in2
 666         vst1.8  {$tmp1},[$out],#16
 667
 668 .Lcbc_done:
 669         vst1.8  {$ivec},[$ivp]
 670 .Lcbc_abort:
 671 ___
 672 }
     # 32-bit epilogue: restore d8-d15 and r4-r8, returning via pc.
 673 $code.=<<___    if ($flavour !~ /64/);
 674         vldmia  sp!,{d8-d15}
 675         ldmia   sp!,{r4-r8,pc}
 676 ___
     # 64-bit epilogue: pop the frame; only x29 is reloaded.
 677 $code.=<<___    if ($flavour =~ /64/);
 678         ldr     x29,[sp],#16
 679         ret
 680 ___
 681 $code.=<<___;
 682 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 683 ___
 684 }}}
 685 {{{
     # CTR section: emits ${prefix}_ctr32_encrypt_blocks.
     #   $inp (x0) input, $out (x1) output, $len (x2) number of 16-byte
     #   blocks (it is decremented by 3 per three blocks written),
     #   $key (x3) key schedule, $ivp (x4) counter block.  In 32-bit
     #   flavours the fifth argument is loaded from the caller's stack.
     # Only the 32-bit word at offset 12 of the counter block is
     # incremented; it is byte-swapped with "rev" around the arithmetic
     # unless __ARMEB__ is defined.  The counter block behind $ivp is only
     # read here - there is no store through $ivp in this routine.
 686 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 687 my ($rounds,$cnt,$key_)=("w5","w6","x7");
 688 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
 689 my $step="x12";         # aliases with $tctr2
 690
 691 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 692 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 693
 694 my ($dat,$tmp)=($dat0,$tmp0);
 695
 696 ### q8-q15      preloaded key schedule
 697
 698 $code.=<<___;
 699 .globl  ${prefix}_ctr32_encrypt_blocks
 700 .type   ${prefix}_ctr32_encrypt_blocks,%function
 701 .align  5
 702 ${prefix}_ctr32_encrypt_blocks:
 703 ___
     # 64-bit prologue: push x29/x30.
 704 $code.=<<___    if ($flavour =~ /64/);
 705         stp             x29,x30,[sp,#-16]!
 706         add             x29,sp,#0
 707 ___
     # 32-bit prologue: save r4-r10,lr and d8-d15, then load the fifth
     # argument from the caller's stack (ip holds the incoming sp).
 708 $code.=<<___    if ($flavour !~ /64/);
 709         mov             ip,sp
 710         stmdb           sp!,{r4-r10,lr}
 711         vstmdb          sp!,{d8-d15}            @ ABI specification says so
 712         ldr             r4, [ip]                @ load remaining arg
 713 ___
     # Main body.
     #   .Loop3x_ctr32 handles three blocks per iteration; .Lctr32_tail
     #   handles a remainder of one or two blocks (and is also the entry
     #   point when fewer than three blocks were requested).
     #   $step is cleared ("cclr") when only one block remains so that the
     #   tail's first input load does not advance $inp; the second input
     #   block is loaded unconditionally but stored only when $len is not 1.
     #   $ivec keeps the original counter block; each keystream input is a
     #   copy of it with lane 3 replaced by the next counter value.
 714 $code.=<<___;
 715         ldr             $rounds,[$key,#240]
 716
 717         ldr             $ctr, [$ivp, #12]
 718         vld1.32         {$dat0},[$ivp]
 719
 720         vld1.32         {q8-q9},[$key]          // load key schedule...
 721         sub             $rounds,$rounds,#4
 722         mov             $step,#16
 723         cmp             $len,#2
 724         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
 725         sub             $rounds,$rounds,#2
 726         vld1.32         {q12-q13},[$key_],#32
 727         vld1.32         {q14-q15},[$key_],#32
 728         vld1.32         {$rndlast},[$key_]
 729         add             $key_,$key,#32
 730         mov             $cnt,$rounds
 731         cclr            $step,lo
 732 #ifndef __ARMEB__
 733         rev             $ctr, $ctr
 734 #endif
 735         vorr            $dat1,$dat0,$dat0
 736         add             $tctr1, $ctr, #1
 737         vorr            $dat2,$dat0,$dat0
 738         add             $ctr, $ctr, #2
 739         vorr            $ivec,$dat0,$dat0
 740         rev             $tctr1, $tctr1
 741         vmov.32         ${dat1}[3],$tctr1
 742         b.ls            .Lctr32_tail
 743         rev             $tctr2, $ctr
 744         sub             $len,$len,#3            // bias
 745         vmov.32         ${dat2}[3],$tctr2
 746         b               .Loop3x_ctr32
 747
 748 .align  4
 749 .Loop3x_ctr32:
 750         aese            $dat0,q8
 751         aesmc           $dat0,$dat0
 752         aese            $dat1,q8
 753         aesmc           $dat1,$dat1
 754         aese            $dat2,q8
 755         aesmc           $dat2,$dat2
 756         vld1.32         {q8},[$key_],#16
 757         subs            $cnt,$cnt,#2
 758         aese            $dat0,q9
 759         aesmc           $dat0,$dat0
 760         aese            $dat1,q9
 761         aesmc           $dat1,$dat1
 762         aese            $dat2,q9
 763         aesmc           $dat2,$dat2
 764         vld1.32         {q9},[$key_],#16
 765         b.gt            .Loop3x_ctr32
 766
 767         aese            $dat0,q8
 768         aesmc           $tmp0,$dat0
 769         aese            $dat1,q8
 770         aesmc           $tmp1,$dat1
 771          vld1.8         {$in0},[$inp],#16
 772          vorr           $dat0,$ivec,$ivec
 773         aese            $dat2,q8
 774         aesmc           $dat2,$dat2
 775          vld1.8         {$in1},[$inp],#16
 776          vorr           $dat1,$ivec,$ivec
 777         aese            $tmp0,q9
 778         aesmc           $tmp0,$tmp0
 779         aese            $tmp1,q9
 780         aesmc           $tmp1,$tmp1
 781          vld1.8         {$in2},[$inp],#16
 782          mov            $key_,$key
 783         aese            $dat2,q9
 784         aesmc           $tmp2,$dat2
 785          vorr           $dat2,$ivec,$ivec
 786          add            $tctr0,$ctr,#1
 787         aese            $tmp0,q12
 788         aesmc           $tmp0,$tmp0
 789         aese            $tmp1,q12
 790         aesmc           $tmp1,$tmp1
 791          veor           $in0,$in0,$rndlast
 792          add            $tctr1,$ctr,#2
 793         aese            $tmp2,q12
 794         aesmc           $tmp2,$tmp2
 795          veor           $in1,$in1,$rndlast
 796          add            $ctr,$ctr,#3
 797         aese            $tmp0,q13
 798         aesmc           $tmp0,$tmp0
 799         aese            $tmp1,q13
 800         aesmc           $tmp1,$tmp1
 801          veor           $in2,$in2,$rndlast
 802          rev            $tctr0,$tctr0
 803         aese            $tmp2,q13
 804         aesmc           $tmp2,$tmp2
 805          vmov.32        ${dat0}[3], $tctr0
 806          rev            $tctr1,$tctr1
 807         aese            $tmp0,q14
 808         aesmc           $tmp0,$tmp0
 809         aese            $tmp1,q14
 810         aesmc           $tmp1,$tmp1
 811          vmov.32        ${dat1}[3], $tctr1
 812          rev            $tctr2,$ctr
 813         aese            $tmp2,q14
 814         aesmc           $tmp2,$tmp2
 815          vmov.32        ${dat2}[3], $tctr2
 816          subs           $len,$len,#3
 817         aese            $tmp0,q15
 818         aese            $tmp1,q15
 819         aese            $tmp2,q15
 820
 821         veor            $in0,$in0,$tmp0
 822          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
 823         vst1.8          {$in0},[$out],#16
 824         veor            $in1,$in1,$tmp1
 825          mov            $cnt,$rounds
 826         vst1.8          {$in1},[$out],#16
 827         veor            $in2,$in2,$tmp2
 828          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
 829         vst1.8          {$in2},[$out],#16
 830         b.hs            .Loop3x_ctr32
 831
 832         adds            $len,$len,#3
 833         b.eq            .Lctr32_done
 834         cmp             $len,#1
 835         mov             $step,#16
 836         cclr            $step,eq
 837
 838 .Lctr32_tail:
 839         aese            $dat0,q8
 840         aesmc           $dat0,$dat0
 841         aese            $dat1,q8
 842         aesmc           $dat1,$dat1
 843         vld1.32         {q8},[$key_],#16
 844         subs            $cnt,$cnt,#2
 845         aese            $dat0,q9
 846         aesmc           $dat0,$dat0
 847         aese            $dat1,q9
 848         aesmc           $dat1,$dat1
 849         vld1.32         {q9},[$key_],#16
 850         b.gt            .Lctr32_tail
 851
 852         aese            $dat0,q8
 853         aesmc           $dat0,$dat0
 854         aese            $dat1,q8
 855         aesmc           $dat1,$dat1
 856         aese            $dat0,q9
 857         aesmc           $dat0,$dat0
 858         aese            $dat1,q9
 859         aesmc           $dat1,$dat1
 860          vld1.8         {$in0},[$inp],$step
 861         aese            $dat0,q12
 862         aesmc           $dat0,$dat0
 863         aese            $dat1,q12
 864         aesmc           $dat1,$dat1
 865          vld1.8         {$in1},[$inp]
 866         aese            $dat0,q13
 867         aesmc           $dat0,$dat0
 868         aese            $dat1,q13
 869         aesmc           $dat1,$dat1
 870          veor           $in0,$in0,$rndlast
 871         aese            $dat0,q14
 872         aesmc           $dat0,$dat0
 873         aese            $dat1,q14
 874         aesmc           $dat1,$dat1
 875          veor           $in1,$in1,$rndlast
 876         aese            $dat0,q15
 877         aese            $dat1,q15
 878
 879         cmp             $len,#1
 880         veor            $in0,$in0,$dat0
 881         veor            $in1,$in1,$dat1
 882         vst1.8          {$in0},[$out],#16
 883         b.eq            .Lctr32_done
 884         vst1.8          {$in1},[$out]
 885
 886 .Lctr32_done:
 887 ___
     # 32-bit epilogue: restore d8-d15 and r4-r10, returning via pc.
 888 $code.=<<___    if ($flavour !~ /64/);
 889         vldmia          sp!,{d8-d15}
 890         ldmia           sp!,{r4-r10,pc}
 891 ___
     # 64-bit epilogue: pop the frame; only x29 is reloaded.
 892 $code.=<<___    if ($flavour =~ /64/);
 893         ldr             x29,[sp],#16
 894         ret
 895 ___
 896 $code.=<<___;
 897 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 898 ___
 899 }}}
 900 $code.=<<___;           # closes the "#if __ARM_MAX_ARCH__>=7" opened in the file prologue
 901 #endif
 902 ___
 903 ########################################
 904 if ($flavour =~ /64/) {                 ######## 64-bit code
         # Post-processing for AArch64: walk $code line by line, rewrite the
         # mixed notation into 64-bit syntax and print the result.
         #
         # %opcode and unaes() encode an AES instruction as a raw .inst
         # word.  NOTE(review): in this branch the substitution that would
         # call unaes() is commented out below, so both appear to be unused
         # and the AES mnemonics are emitted verbatim.
 905     my %opcode = (
 906         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
 907         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
 908
         # unaes(MNEMONIC, ARGS): ARGS must start with two q/v registers; the
         # first number goes into bits 0-4 and the second into bits 5-9 of
         # the opcode.  Yields a false value when ARGS does not match.
 909     local *unaes = sub {
 910         my ($mnemonic,$arg)=@_;
 911
 912         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
 913         sprintf ".inst\t0x%08x\t//%s %s",
 914                         $opcode{$mnemonic}|$1|($2<<5),
 915                         $mnemonic,$arg;
 916     };
 917
 918     foreach(split("\n",$code)) {
             # evaluate backquoted Perl fragments embedded in the assembly
 919         s/\`([^\`]*)\`/eval($1)/geo;
 920
             # q0-q7 become v0-v7, q8-q15 become v16-v23 (v8-v15 are skipped,
             # presumably because they are callee-saved - TODO confirm)
 921         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
 922         s/@\s/\/\//o;                   # old->new style commentary
 923
             # The following is ONE statement chained with "or": only the
             # first substitution that matches is applied to a given line,
             # so the order of the alternatives matters.
 924         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
 925         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
 926         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
 927         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
 928         s/vext\.8/ext/o         or
 929         s/vrev32\.8/rev32/o     or
 930         s/vtst\.8/cmtst/o       or
 931         s/vshr/ushr/o           or
 932         s/^(\s+)v/$1/o          or      # strip off v prefix
 933         s/\bbx\s+lr\b/ret/o;
 934
 935         # fix up remaining legacy suffixes
             # The element-size suffix is dropped from the mnemonic and the
             # ".16b" arrangement on the operands is changed to match; lines
             # with a "],#8" post-increment get the 64-bit ".8b" form.  The
             # last rule turns lane references such as ".4s[3]" into ".s[3]".
 936         s/\.[ui]?8//o;
 937         m/\],#8/o and s/\.16b/\.8b/go;
 938         s/\.[ui]?32//o and s/\.16b/\.4s/go;
 939         s/\.[ui]?64//o and s/\.16b/\.2d/go;
 940         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 941
 942         print $_,"\n";
 943     }
 944 } else {                                ######## 32-bit code
         # Post-processing for 32-bit ARM: rewrite the mixed notation into
         # ARMv7/NEON syntax and print the result.  AES instructions are
         # emitted as raw bytes by unaes().
 945     my %opcode = (
 946         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
 947         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
 948
         # unaes(MNEMONIC, ARGS): merge the two q-register numbers found in
         # ARGS into the opcode and return it as four .byte values, least
         # significant byte first, with the original instruction kept as a
         # trailing comment.
 949     local *unaes = sub {
 950         my ($mnemonic,$arg)=@_;
 951
 952         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
 953             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
 954                                          |(($2&7)<<1) |(($2&8)<<2);
 955             # since ARMv7 instructions are always encoded little-endian.
 956             # correct solution is to use .inst directive, but older
 957             # assemblers don't implement it:-(
 958             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 959                         $word&0xff,($word>>8)&0xff,
 960                         ($word>>16)&0xff,($word>>24)&0xff,
 961                         $mnemonic,$arg;
 962         }
 963     };
 964
         # unvtbl(ARGS): "qD,{qT},qM" is expanded into two vtbl.8
         # instructions, one per 64-bit half (d2D/d2M and d2D+1/d2M+1), both
         # using {qT} as the table.
 965     sub unvtbl {
 966         my $arg=shift;
 967
 968         $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
 969         sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
 970                 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
 971     }
 972
         # unvdup32(ARGS): "qD,qS[i]" becomes "qD,d(2S+i/2)[i%2]", i.e. the
         # lane of the q register is re-expressed as a lane of the d register
         # that holds it.
 973     sub unvdup32 {
 974         my $arg=shift;
 975
 976         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 977         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
 978     }
 979
         # unvmov32(ARGS): "qD[i],src" becomes "d(2D+i/2)[i%2],src".
 980     sub unvmov32 {
 981         my $arg=shift;
 982
 983         $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
 984         sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
 985     }
 986
 987     foreach(split("\n",$code)) {
             # evaluate backquoted Perl fragments embedded in the assembly
 988         s/\`([^\`]*)\`/eval($1)/geo;
 989
             # NOTE(review): the second rule only matches single-digit
             # registers (v0-v9); the new-style operands in this file are
             # v0 and v1 only.
 990         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
 991         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
 992         s/\/\/\s?/@ /o;                         # new->old style commentary
 993
 994         # fix up remaining new-style suffixes
             # a "{qN},[..],#8" access becomes a d-register access with
             # write-back; any other "],#imm" post-increment becomes "]!"
 995         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
 996         s/\],#[0-9]+/]!/o;
 997
             # One "or"-chained statement: only the first matching rewrite
             # is applied to a line, so the order of the alternatives matters.
 998         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
 999         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
1000         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
1001         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
1002         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
1003         s/^(\s+)b\./$1b/o                               or
1004         s/^(\s+)mov\./$1mov/o                           or
1005         s/^(\s+)ret/$1bx\tlr/o;
1006
1007         print $_,"\n";
1008     }
1009 }
1010
1011 close STDOUT or die "error closing STDOUT: $!";    # STDOUT is the pipe to arm-xlate.pl: close fails on a flush error or a non-zero exit of the translator