# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
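##
## In outline: every byte-level AES operation below is built from
## vperm, a constant-time 16-entry table lookup. Each input byte is
## split into two 4-bit nibbles, each nibble indexes its own table,
## and the halves are recombined with vxor, roughly
##
##	y = t_lo[x & 0x0f] ^ t_hi[x >> 4]
##
## so no secret-dependent load address is ever formed.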
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this module
#
# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. This module makes no assumptions about key schedule
#	(or data, for that matter) alignment and handles it in-line.
#	Secondly, being transliterated from vpaes-x86_64.pl, it relies
#	on "nested inversion", which is better suited for Intel CPUs.
#
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.
if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
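# Typical invocation, assuming the usual perlasm flow (flavour and
# output name below are only illustrative):
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s
#
# ppc-xlate.pl receives the flavour and the output file name and
# rewrites the generic assembly for the target ABI and endianness.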
.align	7		# totally strategic alignment

Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# shift rows
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
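##
## Each .long row above is a vperm index vector: every byte selects
## one source byte, so a row encodes a fixed byte permutation. For
## example, the first Lk_mc_forward row 0x01020300,... rotates each
## 4-byte column left by one byte,
##
##	in : a0 a1 a2 a3 | a4 a5 a6 a7 | ...
##	out: a1 a2 a3 a0 | a5 a6 a7 a4 | ...
##
## Lk_mc_backward rotates the opposite way, and the four Lk_sr rows
## are the per-round ShiftRows patterns.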
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sbo:		# sbou, sbot
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
	mflr	r12		# distance between . and _vpaes_consts
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"

my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
my ($inp,$out,$key) = map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
_vpaes_encrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
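##
## v8 and v9 are the nibble-extraction constants: the low nibble of
## each byte is taken with vand against v9 (0x0f..0f), the high
## nibble with vsrb by v8 (4), e.g. byte 0xc5 -> lo 0x05, hi 0x0c.
## v7 is the all-zero vector used as second vperm source below.
##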
	.byte	0,12,0x14,0,0,0,0,0

##
##  AES-encrypt %xmm0.
##
##  %xmm9-%xmm15 as in _vpaes_preheat
##  (%rdx) = scheduled keys
##
##  Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm5		# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm3,	%xmm2
	vxor	v0, v0, v5		# vpxor	%xmm5,	%xmm1,	%xmm0
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
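##
## The sequence above is the input transform as one worked equation,
## per byte with lo(x) = x & 0x0f and hi(x) = x >> 4:
##
##	x' = ipt_lo[lo(x)] ^ ipt_hi[hi(x)] ^ rk0
##
## i.e. a linear change of basis into the vpaes representation,
## folded together with the round-0 key.
##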
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and	\$0x30,	%r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10),	%xmm4	# .Lk_mc_backward[]
	vperm	v3, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
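##
## Writing A for the sbox output column, the xor chain above
## evaluates MixColumns as
##
##	out = 2A ^ 3*rot1(A) ^ rot2(A) ^ rot3(A)
##
## with the 2A term from the sb2 tables and the rotations from
## Lk_mc_forward/Lk_mc_backward, matching the {02,03,01,01}
## MixColumns coefficients.
##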
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	vxor	v3, v3, v5		# vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo

	# middle of last round
					# vmovdqa	-0x60(%r10),	%xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa	-0x50(%r10),	%xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10),	%xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_encrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	bdnz	Lenc_out_unaligned

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_encrypt,.-.vpaes_encrypt
_vpaes_decrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0

##
##  Same API as encryption core.
##
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm4	# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
	vxor	v0, v0, v5		# vpxor	%xmm4,	%xmm2,	%xmm2
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
	# Inverse mix columns
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
					# vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sb9u
	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb9t
	vxor	v5, v5, v4		# vpxor	%xmm4,	%xmm0,	%xmm0
					# vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
					# vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt
	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
					# vmovdqa	0x20(%r10),	%xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
					# vmovdqa	0x30(%r10),	%xmm1	# 0 : sbbt
	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
					# vmovdqa	0x40(%r10),	%xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
					# vmovdqa	0x50(%r10),	%xmm1	# 0 : sbet
	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
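##
## The chain above is inverse MixColumns built from one table pair
## per coefficient, with an Lk_mc_forward rotation (the "MC ch"
## vperm) between accumulation steps:
##
##	ch = 9*x
##	ch = D*x ^ rot(ch)
##	ch = B*x ^ rot(ch)
##	ch = E*x ^ rot(ch)
##
## which accumulates the {0E,0B,0D,09} circulant.
##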
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo

	# middle of last round
					# vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
					# vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb	%xmm2,	%xmm0,	%xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_decrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	bdnz	Ldec_out_unaligned

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_decrypt,.-.vpaes_decrypt
.globl	.vpaes_cbc_encrypt
	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv

	mcrf	cr1, cr0		# put aside $out alignment flag
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	?lvsl	$inpperm, 0, r31
	?vperm	v24, v24, v25, $inpperm

	cmpwi	r8, 0			# test direction
	neg	r8, $inp		# prepare for unaligned access
	?lvsl	$keyperm, 0, $key
	?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	bl	_vpaes_encrypt_preheat

	beq	cr1, Lcbc_enc_loop	# $out is aligned

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvebx	$outhead, r8, r9
	sub.	r30, r30, r0		# len -= 16
	beq	Lcbc_unaligned_done

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	bl	_vpaes_decrypt_preheat

	beq	cr1, Lcbc_dec_loop	# $out is aligned

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvebx	$outhead, r8, r9
	sub.	r30, r30, r0		# len -= 16
	beq	Lcbc_unaligned_done

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask

	beq	cr1, Lcbc_write_iv	# $out is aligned

	stvebx	$outhead, r9, $out

	neg	r8, r31			# write [potentially unaligned] iv
	?lvsl	$outperm, 0, r8
	vperm	v24, v24, v24, $outperm	# rotate right/left
	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	.byte	0,12,0x04,1,0x80,2,6,0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
my ($inp,$bits,$out)=map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

########################################################
##                  AES key schedule                  ##
########################################################
	li	r11, 0xc0		# Lk_inv
	vspltisb	v8,4		# 0x04..04
	vxor	v9,v9,v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	lvx	$iptlo, r12, r9		# Lk_ipt
	lvx	v14, r12, r11		# Lk_sb1
	lvx	v16, r12, r9		# Lk_dksd
	lvx	v18, r12, r11		# Lk_dksb
	lvx	v20, r12, r9		# Lk_dkse
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lk_s63
	.byte	0,12,0x14,0,0,0,0,0
_vpaes_schedule_core:
	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu	(%rdi),	%xmm0	# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	?vperm	v0, v0, v6, $inpperm

	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa	%xmm0,	%xmm7
	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov	\$0x30,%r8d
	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10

	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	$outhead, v4, v4, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	addi	$out, $out, 15		# 15 is not a typo
	xori	r8, r8, 0x30		# xor	\$0x30, %r8

	cmplwi	$bits, 192		# cmp	\$192,	%esi
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
	li	r0, 10			# mov	\$10, %esi
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# write output
##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing. The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
	li	r0, 4			# mov	\$4,	%esi
	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros

	bl	_vpaes_schedule_round
	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear
##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6. The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
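##
## Worked count (a sanity check, not from the original notes):
## AES-256 needs 15 round keys. The zeroth is written by the common
## code above; the loop below runs with ctr = 7 and emits two keys
## per iteration, the last via Lschedule_mangle_last: 1 + 7*2 = 15.
##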
	li	r0, 7			# mov	\$7, %esi
	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0	# load key part 2 (unaligned)
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform

	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa	%xmm0,	%xmm6	# save cur_lo in xmm6

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa	%xmm5,	%xmm7
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  when encrypting, outputs out(%xmm0) ^ 63
##  when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
	bne	$dir, Lschedule_mangle_last_dec

	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea	.Lk_opt(%rip),	%r11	# prepare to output transform
	li	r9, 0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, 16		# add	\$16,	%rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	stvewx	v0, 0, $out		# some (or all) are redundant
	stvewx	v0, r10, $out
	stvewx	v0, r11, $out
	stvewx	v0, r12, $out
	b	Lschedule_mangle_done
Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, -16		# add	\$-16,	%rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)	# save last key
	addi	r9, $out, -15		# -15 is not a typo
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	stvewx	v0, 0, r9		# some (or all) are redundant
Lschedule_mangle_done:
	vxor	v0, v0, v0		# vpxor	%xmm0,	%xmm0,	%xmm0
	vxor	v1, v1, v1		# vpxor	%xmm1,	%xmm1,	%xmm1
	vxor	v2, v2, v2		# vpxor	%xmm2,	%xmm2,	%xmm2
	vxor	v3, v3, v3		# vpxor	%xmm3,	%xmm3,	%xmm3
	vxor	v4, v4, v4		# vpxor	%xmm4,	%xmm4,	%xmm4
	vxor	v5, v5, v5		# vpxor	%xmm5,	%xmm5,	%xmm5
	vxor	v6, v6, v6		# vpxor	%xmm6,	%xmm6,	%xmm6
	vxor	v7, v7, v7		# vpxor	%xmm7,	%xmm7,	%xmm7
	.byte	0,12,0x14,0,0,0,0,0
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6: low side,  d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
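##
## AltiVec has no vpshufd, so the ?vsldoi instructions below emulate
## it: shifting against the all-zero register v9 duplicates or
## clears whole 32-bit lanes, e.g. the first ?vsldoi realizes
## vpshufd \$0x80 by turning (d c 0 0) into (c 0 0 0).
##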
_vpaes_schedule_192_smear:
	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	.byte	0,12,0x14,0,0,0,0,0
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor	%xmm4,	%xmm4,	%xmm4
	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
	vxor	v7, v7, v1		# vpxor	%xmm1,	%xmm7,	%xmm7
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
	vxor	v7, v7, v1		# vpxor	%xmm1,	%xmm7,	%xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
	vand	v1, v1, v0		# vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor	%xmm4,	%xmm7,	%xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb	%xmm3,	%xmm10,	%xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb	%xmm4,	%xmm10,	%xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor	%xmm1,	%xmm3,	%xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor	%xmm0,	%xmm2,	%xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb	%xmm3,	%xmm13,	%xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb	%xmm2,	%xmm12,	%xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor	%xmm4,	%xmm1,	%xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor	%xmm7,	%xmm1,	%xmm0
	vxor	v7, v1, v7		# vmovdqa	%xmm0,	%xmm7
	.byte	0,12,0x14,0,0,0,0,0
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand	%xmm9,	%xmm0,	%xmm1
	vsrb	v2, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
					# vmovdqa	(%r11),	%xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
					# vmovdqa	16(%r11),	%xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb	%xmm0,	%xmm1,	%xmm0
	vxor	v0, v0, v2		# vpxor	%xmm2,	%xmm0,	%xmm0
	.byte	0,12,0x14,0,0,0,0,0
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##	xor with 0x63
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
##  On decrypt,
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	deskew
##	apply shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##
##  Clobbers xmm1-xmm5
##
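##
## Worked example: r8 indexes the four Lk_sr rows. Each call updates
## r8 = (r8 - 16) & 0x30, cycling 0x30 -> 0x20 -> 0x10 -> 0x00 ->
## 0x30 -> ..., so the round number mod 4 picks the ShiftRows
## permutation that is applied.
##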
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
					# vmovdqa	.Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	addi	$out, $out, 16		# add	\$16,	%rdx
	vperm	v4, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm4
	vperm	v1, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm1
	vperm	v3, v1, v1, v25		# vpshufb	%xmm5,	%xmm1,	%xmm3
	vxor	v4, v4, v1		# vpxor	%xmm1,	%xmm4,	%xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	vxor	v3, v3, v4		# vpxor	%xmm4,	%xmm3,	%xmm3

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
	addi	r8, r8, -16		# add	\$-16,	%r8
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
Lschedule_mangle_dec:
	# inverse mix columns
					# lea	.Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

					# vmovdqa	0x00(%r11),	%xmm2
	vperm	v2, v16, v16, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
					# vmovdqa	0x10(%r11),	%xmm3
	vperm	v3, v17, v17, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3

					# vmovdqa	0x20(%r11),	%xmm2
	vperm	v2, v18, v18, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3,	%xmm2,	%xmm2
					# vmovdqa	0x30(%r11),	%xmm3
	vperm	v3, v19, v19, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3

					# vmovdqa	0x40(%r11),	%xmm2
	vperm	v2, v20, v20, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3,	%xmm2,	%xmm2
					# vmovdqa	0x50(%r11),	%xmm3
	vperm	v3, v21, v21, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2,	%xmm3,	%xmm3

					# vmovdqa	0x60(%r11),	%xmm2
	vperm	v2, v22, v22, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
					# vmovdqa	0x70(%r11),	%xmm4
	vperm	v4, v23, v23, v1	# vpshufb	%xmm1,	%xmm4,	%xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	vxor	v2, v2, v3		# vpxor	%xmm3,	%xmm2,	%xmm2
	vxor	v3, v4, v2		# vpxor	%xmm2,	%xmm4,	%xmm3

	addi	$out, $out, -16		# add	\$-16,	%rdx

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
	addi	r8, r8, -16		# add	\$-16,	%r8
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_set_encrypt_key
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers
	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax (x86; PPC adds 6)
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;
	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov	\$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
.globl	.vpaes_set_decrypt_key
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers
	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax (x86; PPC adds 6)
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;
	slwi	r9, r9, 4		# shl	\$4,%eax
	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv) {
		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do { @bytes=reverse(@bytes); last; };
		}
	    }

	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}

	$consts=0 if (m/Lconsts:/o);	# end of table
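	# Worked example: on little-endian builds a ?rev row is emitted
	# with its 16 bytes reversed. lvx always loads bytes in memory
	# order, so storing them reversed gives the loaded vector the
	# same lane layout a big-endian build sees; e.g.
	# ".long 0x00010203, ... ?rev" emits 0x00,0x01,0x02,0x03,...
	# as-is on BE and in fully reversed byte order on LE.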
	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
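	    # These rewrites undo big-endian assumptions baked into the
	    # '?' forms: ?vperm swaps its two source registers, ?vsldoi
	    # turns a shift by n into 16-n, and ?vspltw selects word
	    # 3-n, mirroring how LE mode renumbers vector bytes/words.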
	} else {			# big-endian