# crypto/openssl/crypto/aes/asm/aesv8-armx.pl (FreeBSD import of OpenSSL)
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
12 # little-endian cases. As does it support both 32- and 64-bit modes
13 # of operation. Latter is achieved by limiting amount of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
22 #
23 # Performance in cycles per byte processed with 128-bit key:
24 #
25 #               CBC enc         CBC dec         CTR
26 # Apple A7      2.39            1.20            1.20
27 # Cortex-A53    1.32            1.29            1.46
28 # Cortex-A57(*) 1.95            0.85            0.93
29 # Denver        1.96            0.86            0.80
30 #
31 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
32 #       and are still same even for updated module;
33
# First argument selects the flavour (32- or 64-bit), second is the
# output file.  Fail loudly if the output cannot be created instead of
# silently generating nothing (three-arg open, error checked).
$flavour = shift;
my $output = shift;
open STDOUT, ">", $output or die "can't open $output: $!";

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.  Written once in unified syntax; the
# post-processing loop at the bottom of the file transliterates it to
# native 64-bit or legacy 32-bit assembly.
#
# Integer operand aliases (w12 for rounds so it survives into the
# decrypt-key path as x12).
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON working registers.  The 32-bit flavour maps onto q0-q3,q8-q10,
# skipping q4-q7 -- presumably to avoid the callee-saved d8-d15 range;
# TODO confirm against the AAPCS.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table: rcon words, a rotate-n-splat byte-permutation
# mask for vtbl, and the 0x1b constant used for the final 128-bit rcon.
$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit-only frame setup (stripped from the 32-bit flavour).
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation: -1 for NULL pointers, -2 for a key length that is
# not 128/192/256; result is returned through $ptr at .Lenc_key_abort.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decrypt schedule: run .Lenc_key, then reverse the round keys in place
# (walking $out up and $inp down), applying aesimc (InvMixColumns) to
# every key except the outermost two.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block en/decrypt helpers: emits ${prefix}_encrypt and
# ${prefix}_decrypt from one template.  $dir selects the instruction
# pair: "en" -> aese/aesmc, "de" -> aesd/aesimc; everything else is
# identical between the two functions.
#
# NOTE: the original declared "sub gen_block ()" with an empty
# prototype and then had to call it as &gen_block("en"), relying on the
# &-call syntax bypassing prototype checking.  The prototype and the &
# are both dropped -- plain subs and plain calls are the idiomatic and
# equivalent form.
sub gen_block {
my $dir = shift;			# "en" or "de"
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));	# argument registers
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Round count is read from key[240]; the loop consumes two round keys
# per iteration, with the final two rounds peeled off after the loop.
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# CBC encrypt/decrypt: ${prefix}_cbc_encrypt(inp,out,len,key,ivp,enc).
# Encrypt is serial by nature (one block at a time, with a dedicated
# fast path for 128-bit keys); decrypt processes three blocks per
# iteration.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Direct pointers to round keys 4-7 for the generic encrypt path.
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# 32-bit flavour passes the 5th/6th arguments on the stack.
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Common prologue: load IV, first data block and the whole key schedule
# (q8-q15 plus $rndlast for the final round key).
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# Decrypt path: three blocks in flight ($dat0-$dat2), ciphertext kept
# in $in0-$in2 for the CBC xor after decryption.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR mode: ${prefix}_ctr32_encrypt_blocks(inp,out,blocks,key,ivp).
# The counter is the last 32-bit word of the IV (loaded from byte
# offset 12 and byte-reversed on little-endian); the main loop keeps
# three counter blocks in flight.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
# 32-bit flavour passes the 5th argument on the stack.
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: $code above is written in a unified, mostly-32-bit
# NEON syntax; the appropriate branch below transliterates it line by
# line into native 64-bit or legacy 32-bit assembly and prints the
# result to STDOUT.  The or-chained substitutions are order-sensitive.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word (fallback for
    # assemblers without crypto-extension support; currently unused --
    # see the commented-out substitution below).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode an AES instruction as raw .byte data.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Expand a 128-bit vtbl.8 on q registers into the two legal
    # 64-bit-destination vtbl.8 instructions.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Rewrite vdup.32 qN,qM[i] using the d-register view of qM.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Rewrite vmov.32 qN[i],... using the d-register view of qN.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}
988
# STDOUT was re-opened onto the output file; check close so buffered
# write errors (e.g. disk full) are not silently discarded.
close STDOUT or die "error closing STDOUT: $!";