# crypto/openssl/crypto/aes/asm/aesv8-armx.pl
# (FreeBSD vendor copy of OpenSSL; merged from OpenSSL 1.1.1i)
1 #! /usr/bin/env perl
2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. As does it support both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
29 #
30 # Performance in cycles per byte processed with 128-bit key:
31 #
32 #               CBC enc         CBC dec         CTR
33 # Apple A7      2.39            1.20            1.20
34 # Cortex-A53    1.32            1.29            1.46
35 # Cortex-A57(*) 1.95            0.85            0.93
36 # Denver        1.96            0.86            0.80
37 # Mongoose      1.33            1.20            1.20
38 # Kryo          1.26            0.94            1.00
39 #
40 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
41 #       and are still same even for updated module;
42
# Command line: first argument is the "flavour" (e.g. linux64, ios32),
# which selects 32- vs 64-bit code generation; the second is the output
# file, passed on to the arm-xlate.pl translator together with the
# flavour.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl assembler-dialect translator either next to
# this script or in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  Fail loudly if the
# pipe cannot be opened instead of silently generating no output.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
53
# Symbol prefix for every routine emitted by this module
# (aes_v8_set_encrypt_key, aes_v8_cbc_encrypt, ...).
$prefix="aes_v8";

# Common preamble: the whole module is compiled out when the build's
# maximum run-time architecture level is below ARMv7.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch armv8-a+crypto\n"                       if ($flavour =~ /64/);
# 32-bit flavours only: pin the architecture/FPU and force ARM (not
# Thumb-2) mode so the hand-encoded crypto instructions emitted by the
# post-processing pass assemble predictably.
$code.=<<___                                            if ($flavour !~ /64/);
.arch   armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu    neon
.code   32
#undef  __thumb2__
___
69
70 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
71 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
72 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
74 #
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.  Expansion exploits the fact that aese
# with an all-zero round key performs SubBytes, and uses vtbl with the
# rotate-n-splat permutation from .Lrcon for the word rotation.
#
# Register/argument aliases.  On 32-bit flavours the vector registers
# are drawn from q0-q3 and q8-q10 (presumably to stay clear of the
# callee-saved d8-d15 range -- NOTE(review): confirm against AAPCS).
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
        $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table (1, rotate-n-splat permutation, 0x1b) followed
# by the set_encrypt_key entry point; .Lenc_key is also reached via bl
# from set_decrypt_key below.
$code.=<<___;
.align  5
.Lrcon:
.long   0x01,0x01,0x01,0x01
.long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
.long   0x1b,0x1b,0x1b,0x1b

.globl  ${prefix}_set_encrypt_key
.type   ${prefix}_set_encrypt_key,%function
.align  5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit only: establish a frame record.
$code.=<<___    if ($flavour =~ /64/);
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
# Argument validation ($ptr doubles as the return value: -1 for a NULL
# pointer, -2 for a bad bit count, 0 on success), then dispatch on
# 128-, 192- or 256-bit key length.
$code.=<<___;
        mov     $ptr,#-1
        cmp     $inp,#0
        b.eq    .Lenc_key_abort
        cmp     $out,#0
        b.eq    .Lenc_key_abort
        mov     $ptr,#-2
        cmp     $bits,#128
        b.lt    .Lenc_key_abort
        cmp     $bits,#256
        b.gt    .Lenc_key_abort
        tst     $bits,#0x3f
        b.ne    .Lenc_key_abort

        adr     $ptr,.Lrcon
        cmp     $bits,#192

        veor    $zero,$zero,$zero
        vld1.8  {$in0},[$inp],#16
        mov     $bits,#8                // reuse $bits
        vld1.32 {$rcon,$mask},[$ptr],#32

        b.lt    .Loop128
        b.eq    .L192
        b       .L256

.align  4
.Loop128:
        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key
        b.ne    .Loop128

        vld1.32 {$rcon},[$ptr]

        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key

        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        veor    $in0,$in0,$key
        vst1.32 {$in0},[$out]
        add     $out,$out,#0x50

        mov     $rounds,#10
        b       .Ldone

.align  4
.L192:
        vld1.8  {$in1},[$inp],#8
        vmov.i8 $key,#8                 // borrow $key
        vst1.32 {$in0},[$out],#16
        vsub.i8 $mask,$mask,$key        // adjust the mask

.Loop192:
        vtbl.8  $key,{$in1},$mask
        vext.8  $tmp,$zero,$in0,#12
#ifdef __ARMEB__
        vst1.32 {$in1},[$out],#16
        sub     $out,$out,#8
#else
        vst1.32 {$in1},[$out],#8
#endif
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp

        vdup.32 $tmp,${in0}[3]
        veor    $tmp,$tmp,$in1
         veor   $key,$key,$rcon
        vext.8  $in1,$zero,$in1,#12
        vshl.u8 $rcon,$rcon,#1
        veor    $in1,$in1,$tmp
        veor    $in0,$in0,$key
        veor    $in1,$in1,$key
        vst1.32 {$in0},[$out],#16
        b.ne    .Loop192

        mov     $rounds,#12
        add     $out,$out,#0x20
        b       .Ldone

.align  4
.L256:
        vld1.8  {$in1},[$inp]
        mov     $bits,#7
        mov     $rounds,#14
        vst1.32 {$in0},[$out],#16

.Loop256:
        vtbl.8  $key,{$in1},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in1},[$out],#16
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key
        vst1.32 {$in0},[$out],#16
        b.eq    .Ldone

        vdup.32 $key,${in0}[3]          // just splat
        vext.8  $tmp,$zero,$in1,#12
        aese    $key,$zero

        veor    $in1,$in1,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in1,$in1,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in1,$in1,$tmp

        veor    $in1,$in1,$key
        b       .Loop256

.Ldone:
        str     $rounds,[$out]
        mov     $ptr,#0

.Lenc_key_abort:
        mov     x0,$ptr                 // return value
        `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
        ret
.size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl  ${prefix}_set_decrypt_key
.type   ${prefix}_set_decrypt_key,%function
.align  5
${prefix}_set_decrypt_key:
___
# 64-bit only: sign the return address (paciasp, emitted as .inst for
# older assemblers) and establish a frame record.
$code.=<<___    if ($flavour =~ /64/);
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
$code.=<<___    if ($flavour !~ /64/);
        stmdb   sp!,{r4,lr}
___
# set_decrypt_key: build the encrypt schedule via bl .Lenc_key, then
# reverse the round-key order in place, swapping from both ends inward
# and applying aesimc (InvMixColumns) to every key except the outermost
# pair, as the aesd/aesimc decryption sequence requires.
$code.=<<___;
        bl      .Lenc_key

        cmp     x0,#0
        b.ne    .Ldec_key_abort

        sub     $out,$out,#240          // restore original $out
        mov     x4,#-16
        add     $inp,$out,x12,lsl#4     // end of key schedule

        vld1.32 {v0.16b},[$out]
        vld1.32 {v1.16b},[$inp]
        vst1.32 {v0.16b},[$inp],x4
        vst1.32 {v1.16b},[$out],#16

.Loop_imc:
        vld1.32 {v0.16b},[$out]
        vld1.32 {v1.16b},[$inp]
        aesimc  v0.16b,v0.16b
        aesimc  v1.16b,v1.16b
        vst1.32 {v0.16b},[$inp],x4
        vst1.32 {v1.16b},[$out],#16
        cmp     $inp,$out
        b.hi    .Loop_imc

        vld1.32 {v0.16b},[$out]
        aesimc  v0.16b,v0.16b
        vst1.32 {v0.16b},[$inp]

        eor     x0,x0,x0                // return value
.Ldec_key_abort:
___
$code.=<<___    if ($flavour !~ /64/);
        ldmia   sp!,{r4,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldp     x29,x30,[sp],#16
        .inst   0xd50323bf              // autiasp
        ret
___
$code.=<<___;
.size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit the single-block ECB routines ${prefix}_encrypt and
# ${prefix}_decrypt.  Both directions share the same structure -- only
# the symbol name and the instruction suffixes (aese/aesmc vs. the
# inverse-cipher pair aesd/aesimc) differ -- so one generator covers
# both.  The loop consumes two round keys per iteration, having
# pre-subtracted 2 from the round count so the final two rounds (last
# aes$e without MixColumns, then the final-key veor) can be peeled.
#
# Note: no prototype on purpose.  The sub takes one argument ("en" or
# "de"); the previous empty prototype was misleading and was only ever
# bypassed via the &gen_block(...) call form.
sub gen_block {
my $dir = shift;                        # "en" or "de"
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2)); # argument registers
my $rounds="w3";                        # round count, read from key schedule
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2)); # two round keys + data block

$code.=<<___;
.globl  ${prefix}_${dir}crypt
.type   ${prefix}_${dir}crypt,%function
.align  5
${prefix}_${dir}crypt:
        ldr     $rounds,[$key,#240]
        vld1.32 {$rndkey0},[$key],#16
        vld1.8  {$inout},[$inp]
        sub     $rounds,$rounds,#2
        vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
        aes$e   $inout,$rndkey0
        aes$mc  $inout,$inout
        vld1.32 {$rndkey0},[$key],#16
        subs    $rounds,$rounds,#2
        aes$e   $inout,$rndkey1
        aes$mc  $inout,$inout
        vld1.32 {$rndkey1},[$key],#16
        b.gt    .Loop_${dir}c

        aes$e   $inout,$rndkey0
        aes$mc  $inout,$inout
        vld1.32 {$rndkey0},[$key]
        aes$e   $inout,$rndkey1
        veor    $inout,$inout,$rndkey0

        vst1.8  {$inout},[$out]
        ret
.size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# CBC mode: ${prefix}_cbc_encrypt(inp,out,len,key,ivp,enc).
# Encryption is inherently serial (each block chains on the previous
# ciphertext) and is done one block at a time, with a dedicated
# unrolled path for 128-bit keys (.Lcbc_enc128).  Decryption is
# parallelizable and is interleaved three blocks at a time
# (.Loop3x_cbc_dec) with a 1-2 block tail.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15      preloaded key schedule

$code.=<<___;
.globl  ${prefix}_cbc_encrypt
.type   ${prefix}_cbc_encrypt,%function
.align  5
${prefix}_cbc_encrypt:
___
$code.=<<___    if ($flavour =~ /64/);
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
# 32-bit: the 5th and 6th arguments arrive on the stack.
$code.=<<___    if ($flavour !~ /64/);
        mov     ip,sp
        stmdb   sp!,{r4-r8,lr}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load remaining args
___
# Common body.  After "sub $rounds,#6; sub $rounds,#2" $rounds holds
# rounds-8, so 2 selects the 128-bit path and 4 the 192-bit path below.
$code.=<<___;
        subs    $len,$len,#16
        mov     $step,#16
        b.lo    .Lcbc_abort
        cclr    $step,eq

        cmp     $enc,#0                 // en- or decrypting?
        ldr     $rounds,[$key,#240]
        and     $len,$len,#-16
        vld1.8  {$ivec},[$ivp]
        vld1.8  {$dat},[$inp],$step

        vld1.32 {q8-q9},[$key]          // load key schedule...
        sub     $rounds,$rounds,#6
        add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
        sub     $rounds,$rounds,#2
        vld1.32 {q10-q11},[$key_],#32
        vld1.32 {q12-q13},[$key_],#32
        vld1.32 {q14-q15},[$key_],#32
        vld1.32 {$rndlast},[$key_]

        add     $key_,$key,#32
        mov     $cnt,$rounds
        b.eq    .Lcbc_dec

        cmp     $rounds,#2
        veor    $dat,$dat,$ivec
        veor    $rndzero_n_last,q8,$rndlast
        b.eq    .Lcbc_enc128

        vld1.32 {$in0-$in1},[$key_]
        add     $key_,$key,#16
        add     $key4,$key,#16*4
        add     $key5,$key,#16*5
        aese    $dat,q8
        aesmc   $dat,$dat
        add     $key6,$key,#16*6
        add     $key7,$key,#16*7
        b       .Lenter_cbc_enc

.align  4
.Loop_cbc_enc:
        aese    $dat,q8
        aesmc   $dat,$dat
         vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
        aese    $dat,q9
        aesmc   $dat,$dat
        aese    $dat,$in0
        aesmc   $dat,$dat
        vld1.32 {q8},[$key4]
        cmp     $rounds,#4
        aese    $dat,$in1
        aesmc   $dat,$dat
        vld1.32 {q9},[$key5]
        b.eq    .Lcbc_enc192

        aese    $dat,q8
        aesmc   $dat,$dat
        vld1.32 {q8},[$key6]
        aese    $dat,q9
        aesmc   $dat,$dat
        vld1.32 {q9},[$key7]
        nop

.Lcbc_enc192:
        aese    $dat,q8
        aesmc   $dat,$dat
         subs   $len,$len,#16
        aese    $dat,q9
        aesmc   $dat,$dat
         cclr   $step,eq
        aese    $dat,q10
        aesmc   $dat,$dat
        aese    $dat,q11
        aesmc   $dat,$dat
         vld1.8 {q8},[$inp],$step
        aese    $dat,q12
        aesmc   $dat,$dat
         veor   q8,q8,$rndzero_n_last
        aese    $dat,q13
        aesmc   $dat,$dat
         vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
        aese    $dat,q14
        aesmc   $dat,$dat
        aese    $dat,q15
        veor    $ivec,$dat,$rndlast
        b.hs    .Loop_cbc_enc

        vst1.8  {$ivec},[$out],#16
        b       .Lcbc_done

.align  5
.Lcbc_enc128:
        vld1.32 {$in0-$in1},[$key_]
        aese    $dat,q8
        aesmc   $dat,$dat
        b       .Lenter_cbc_enc128
.Loop_cbc_enc128:
        aese    $dat,q8
        aesmc   $dat,$dat
         vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
        aese    $dat,q9
        aesmc   $dat,$dat
         subs   $len,$len,#16
        aese    $dat,$in0
        aesmc   $dat,$dat
         cclr   $step,eq
        aese    $dat,$in1
        aesmc   $dat,$dat
        aese    $dat,q10
        aesmc   $dat,$dat
        aese    $dat,q11
        aesmc   $dat,$dat
         vld1.8 {q8},[$inp],$step
        aese    $dat,q12
        aesmc   $dat,$dat
        aese    $dat,q13
        aesmc   $dat,$dat
        aese    $dat,q14
        aesmc   $dat,$dat
         veor   q8,q8,$rndzero_n_last
        aese    $dat,q15
        veor    $ivec,$dat,$rndlast
        b.hs    .Loop_cbc_enc128

        vst1.8  {$ivec},[$out],#16
        b       .Lcbc_done
___
# Decrypt side: a third data lane ($dat2/$in2/$tmp2) enables the 3x
# interleave.
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align  5
.Lcbc_dec:
        vld1.8  {$dat2},[$inp],#16
        subs    $len,$len,#32           // bias
        add     $cnt,$rounds,#2
        vorr    $in1,$dat,$dat
        vorr    $dat1,$dat,$dat
        vorr    $in2,$dat2,$dat2
        b.lo    .Lcbc_dec_tail

        vorr    $dat1,$dat2,$dat2
        vld1.8  {$dat2},[$inp],#16
        vorr    $in0,$dat,$dat
        vorr    $in1,$dat1,$dat1
        vorr    $in2,$dat2,$dat2

.Loop3x_cbc_dec:
        aesd    $dat0,q8
        aesimc  $dat0,$dat0
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        vld1.32 {q8},[$key_],#16
        subs    $cnt,$cnt,#2
        aesd    $dat0,q9
        aesimc  $dat0,$dat0
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        vld1.32 {q9},[$key_],#16
        b.gt    .Loop3x_cbc_dec

        aesd    $dat0,q8
        aesimc  $dat0,$dat0
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
         veor   $tmp0,$ivec,$rndlast
         subs   $len,$len,#0x30
         veor   $tmp1,$in0,$rndlast
         mov.lo x6,$len                 // x6, $cnt, is zero at this point
        aesd    $dat0,q9
        aesimc  $dat0,$dat0
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
         veor   $tmp2,$in1,$rndlast
         add    $inp,$inp,x6            // $inp is adjusted in such way that
                                        // at exit from the loop $dat1-$dat2
                                        // are loaded with last "words"
         vorr   $ivec,$in2,$in2
         mov    $key_,$key
        aesd    $dat0,q12
        aesimc  $dat0,$dat0
        aesd    $dat1,q12
        aesimc  $dat1,$dat1
        aesd    $dat2,q12
        aesimc  $dat2,$dat2
         vld1.8 {$in0},[$inp],#16
        aesd    $dat0,q13
        aesimc  $dat0,$dat0
        aesd    $dat1,q13
        aesimc  $dat1,$dat1
        aesd    $dat2,q13
        aesimc  $dat2,$dat2
         vld1.8 {$in1},[$inp],#16
        aesd    $dat0,q14
        aesimc  $dat0,$dat0
        aesd    $dat1,q14
        aesimc  $dat1,$dat1
        aesd    $dat2,q14
        aesimc  $dat2,$dat2
         vld1.8 {$in2},[$inp],#16
        aesd    $dat0,q15
        aesd    $dat1,q15
        aesd    $dat2,q15
         vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
         add    $cnt,$rounds,#2
        veor    $tmp0,$tmp0,$dat0
        veor    $tmp1,$tmp1,$dat1
        veor    $dat2,$dat2,$tmp2
         vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
        vst1.8  {$tmp0},[$out],#16
         vorr   $dat0,$in0,$in0
        vst1.8  {$tmp1},[$out],#16
         vorr   $dat1,$in1,$in1
        vst1.8  {$dat2},[$out],#16
         vorr   $dat2,$in2,$in2
        b.hs    .Loop3x_cbc_dec

        cmn     $len,#0x30
        b.eq    .Lcbc_done
        nop

.Lcbc_dec_tail:
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        vld1.32 {q8},[$key_],#16
        subs    $cnt,$cnt,#2
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        vld1.32 {q9},[$key_],#16
        b.gt    .Lcbc_dec_tail

        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        aesd    $dat1,q12
        aesimc  $dat1,$dat1
        aesd    $dat2,q12
        aesimc  $dat2,$dat2
         cmn    $len,#0x20
        aesd    $dat1,q13
        aesimc  $dat1,$dat1
        aesd    $dat2,q13
        aesimc  $dat2,$dat2
         veor   $tmp1,$ivec,$rndlast
        aesd    $dat1,q14
        aesimc  $dat1,$dat1
        aesd    $dat2,q14
        aesimc  $dat2,$dat2
         veor   $tmp2,$in1,$rndlast
        aesd    $dat1,q15
        aesd    $dat2,q15
        b.eq    .Lcbc_dec_one
        veor    $tmp1,$tmp1,$dat1
        veor    $tmp2,$tmp2,$dat2
         vorr   $ivec,$in2,$in2
        vst1.8  {$tmp1},[$out],#16
        vst1.8  {$tmp2},[$out],#16
        b       .Lcbc_done

.Lcbc_dec_one:
        veor    $tmp1,$tmp1,$dat2
         vorr   $ivec,$in2,$in2
        vst1.8  {$tmp1},[$out],#16

.Lcbc_done:
        vst1.8  {$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore callee-saved state and return.
$code.=<<___    if ($flavour !~ /64/);
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r8,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldr     x29,[sp],#16
        ret
___
$code.=<<___;
.size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR mode: ${prefix}_ctr32_encrypt_blocks(inp,out,blocks,key,ivp).
# Per the ctr32 contract only the low 32 bits of the counter (word 3 of
# the IV) are incremented; keystream blocks are produced three at a
# time in .Loop3x_ctr32, with a 1-2 block tail path (.Lctr32_tail).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";         # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15      preloaded key schedule

$code.=<<___;
.globl  ${prefix}_ctr32_encrypt_blocks
.type   ${prefix}_ctr32_encrypt_blocks,%function
.align  5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___    if ($flavour =~ /64/);
        stp             x29,x30,[sp,#-16]!
        add             x29,sp,#0
___
# 32-bit: the 5th argument arrives on the stack.
$code.=<<___    if ($flavour !~ /64/);
        mov             ip,sp
        stmdb           sp!,{r4-r10,lr}
        vstmdb          sp!,{d8-d15}            @ ABI specification says so
        ldr             r4, [ip]                @ load remaining arg
___
# Load the counter word big-endian-correctly, pre-compute counters for
# the first three blocks, and branch to the tail path for short inputs
# ($len <= 2 blocks).
$code.=<<___;
        ldr             $rounds,[$key,#240]

        ldr             $ctr, [$ivp, #12]
#ifdef __ARMEB__
        vld1.8          {$dat0},[$ivp]
#else
        vld1.32         {$dat0},[$ivp]
#endif
        vld1.32         {q8-q9},[$key]          // load key schedule...
        sub             $rounds,$rounds,#4
        mov             $step,#16
        cmp             $len,#2
        add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
        sub             $rounds,$rounds,#2
        vld1.32         {q12-q13},[$key_],#32
        vld1.32         {q14-q15},[$key_],#32
        vld1.32         {$rndlast},[$key_]
        add             $key_,$key,#32
        mov             $cnt,$rounds
        cclr            $step,lo
#ifndef __ARMEB__
        rev             $ctr, $ctr
#endif
        add             $tctr1, $ctr, #1
        vorr            $ivec,$dat0,$dat0
        rev             $tctr1, $tctr1
        vmov.32         ${ivec}[3],$tctr1
        add             $ctr, $ctr, #2
        vorr            $dat1,$ivec,$ivec
        b.ls            .Lctr32_tail
        rev             $tctr2, $ctr
        vmov.32         ${ivec}[3],$tctr2
        sub             $len,$len,#3            // bias
        vorr            $dat2,$ivec,$ivec
        b               .Loop3x_ctr32

.align  4
.Loop3x_ctr32:
        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        aese            $dat2,q8
        aesmc           $dat2,$dat2
        vld1.32         {q8},[$key_],#16
        subs            $cnt,$cnt,#2
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
        aese            $dat2,q9
        aesmc           $dat2,$dat2
        vld1.32         {q9},[$key_],#16
        b.gt            .Loop3x_ctr32

        aese            $dat0,q8
        aesmc           $tmp0,$dat0
        aese            $dat1,q8
        aesmc           $tmp1,$dat1
         vld1.8         {$in0},[$inp],#16
         add            $tctr0,$ctr,#1
        aese            $dat2,q8
        aesmc           $dat2,$dat2
         vld1.8         {$in1},[$inp],#16
         rev            $tctr0,$tctr0
        aese            $tmp0,q9
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q9
        aesmc           $tmp1,$tmp1
         vld1.8         {$in2},[$inp],#16
         mov            $key_,$key
        aese            $dat2,q9
        aesmc           $tmp2,$dat2
        aese            $tmp0,q12
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q12
        aesmc           $tmp1,$tmp1
         veor           $in0,$in0,$rndlast
         add            $tctr1,$ctr,#2
        aese            $tmp2,q12
        aesmc           $tmp2,$tmp2
         veor           $in1,$in1,$rndlast
         add            $ctr,$ctr,#3
        aese            $tmp0,q13
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q13
        aesmc           $tmp1,$tmp1
         veor           $in2,$in2,$rndlast
         vmov.32        ${ivec}[3], $tctr0
        aese            $tmp2,q13
        aesmc           $tmp2,$tmp2
         vorr           $dat0,$ivec,$ivec
         rev            $tctr1,$tctr1
        aese            $tmp0,q14
        aesmc           $tmp0,$tmp0
         vmov.32        ${ivec}[3], $tctr1
         rev            $tctr2,$ctr
        aese            $tmp1,q14
        aesmc           $tmp1,$tmp1
         vorr           $dat1,$ivec,$ivec
         vmov.32        ${ivec}[3], $tctr2
        aese            $tmp2,q14
        aesmc           $tmp2,$tmp2
         vorr           $dat2,$ivec,$ivec
         subs           $len,$len,#3
        aese            $tmp0,q15
        aese            $tmp1,q15
        aese            $tmp2,q15

        veor            $in0,$in0,$tmp0
         vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
        vst1.8          {$in0},[$out],#16
        veor            $in1,$in1,$tmp1
         mov            $cnt,$rounds
        vst1.8          {$in1},[$out],#16
        veor            $in2,$in2,$tmp2
         vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
        vst1.8          {$in2},[$out],#16
        b.hs            .Loop3x_ctr32

        adds            $len,$len,#3
        b.eq            .Lctr32_done
        cmp             $len,#1
        mov             $step,#16
        cclr            $step,eq

.Lctr32_tail:
        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        vld1.32         {q8},[$key_],#16
        subs            $cnt,$cnt,#2
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
        vld1.32         {q9},[$key_],#16
        b.gt            .Lctr32_tail

        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
         vld1.8         {$in0},[$inp],$step
        aese            $dat0,q12
        aesmc           $dat0,$dat0
        aese            $dat1,q12
        aesmc           $dat1,$dat1
         vld1.8         {$in1},[$inp]
        aese            $dat0,q13
        aesmc           $dat0,$dat0
        aese            $dat1,q13
        aesmc           $dat1,$dat1
         veor           $in0,$in0,$rndlast
        aese            $dat0,q14
        aesmc           $dat0,$dat0
        aese            $dat1,q14
        aesmc           $dat1,$dat1
         veor           $in1,$in1,$rndlast
        aese            $dat0,q15
        aese            $dat1,q15

        cmp             $len,#1
        veor            $in0,$in0,$dat0
        veor            $in1,$in1,$dat1
        vst1.8          {$in0},[$out],#16
        b.eq            .Lctr32_done
        vst1.8          {$in1},[$out]

.Lctr32_done:
___
# Epilogues: restore callee-saved state and return.
$code.=<<___    if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}
        ldmia           sp!,{r4-r10,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldr             x29,[sp],#16
        ret
___
$code.=<<___;
.size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the __ARM_MAX_ARCH__>=7 guard opened in the preamble.
$code.=<<___;
#endif
___
########################################
# Post-processing: $code above is written in a mixed 32-/64-bit
# dialect; the loops below transliterate it line by line into genuine
# AArch64 or ARMv7 syntax before printing into the arm-xlate.pl pipe.
# The order of the substitutions is significant.
if ($flavour =~ /64/) {                 ######## 64-bit code
    # Raw AArch64 encodings of the Crypto Extension instructions,
    # usable for hand-assembling via .inst.  (Currently unused -- see
    # the commented-out substitution below.)
    my %opcode = (
        "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
        "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );

    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
        sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$1|($2<<5),
                        $mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
        # Expand `...` constructs by evaluating them as Perl.
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                   # old->new style commentary

        # Convert helper mnemonics (cclr, mov.cond) into csel forms and
        # strip legacy NEON mnemonics down to AArch64 SIMD syntax.
        #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
        s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
        s/vext\.8/ext/o         or
        s/vrev32\.8/rev32/o     or
        s/vtst\.8/cmtst/o       or
        s/vshr/ushr/o           or
        s/^(\s+)v/$1/o          or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        # fix up remaining legacy suffixes
        s/\.[ui]?8//o;
        m/\],#8/o and s/\.16b/\.8b/go;
        s/\.[ui]?32//o and s/\.16b/\.4s/go;
        s/\.[ui]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    # ARMv7 NEON encodings of the AES instructions, emitted as .byte
    # sequences for assemblers that lack the crypto extensions.
    my %opcode = (
        "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
        "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );

    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<1) |(($2&8)<<2);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    };

    # Split a 128-bit vtbl into the two 64-bit lookups ARMv7 supports.
    sub unvtbl {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
        sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map an AArch64-style lane splat onto the ARMv7 d-register form.
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map an AArch64-style lane move onto the ARMv7 d-register form.
    sub unvmov32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
        sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
        # Expand `...` constructs by evaluating them as Perl.
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
        s/\],#[0-9]+/]!/o;

        # Hand-encode AES instructions, lower cclr/vtbl/vdup/vmov, and
        # strip condition-suffix dots the 32-bit assembler won't take.
        s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
        s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
        s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
        s/^(\s+)b\./$1b/o                               or
        s/^(\s+)mov\./$1mov/o                           or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

# Flush and close the pipe to arm-xlate.pl; failure here can mean the
# generated assembly is truncated, so treat it as fatal.
close STDOUT or die "error closing STDOUT: $!";