# Do not modify. This file is auto-generated from bsaes-armv7.pl.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
@ granted.
@ ====================================================================
@ Bit-sliced AES for ARM NEON
@
@ This implementation is a direct adaptation of the bsaes-x86_64 module
@ for ARM NEON, except that this module is endian-neutral [in the sense
@ that it can be compiled for either endianness] courtesy of vld1.8's
@ neutrality. The initial version doesn't implement an interface to
@ OpenSSL, only low-level primitives and unsupported entry points, just
@ enough to collect performance results, which for the Cortex-A8 core
@ are:
@
@ encrypt	19.5 cycles per byte processed with 128-bit key
@ decrypt	22.1 cycles per byte processed with 128-bit key
@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
@
@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
@ which is [much] worse than anticipated (for further details see
@ http://www.openssl.org/~appro/Snapdragon-S4.html).
@
@ Cortex-A15 manages 14.2/16.1 cycles [where integer-only code
@ needs 20.0 cycles].
@
@ When comparing to x86_64 results, keep in mind that the NEON unit is
@ [mostly] single-issue and thus can't [fully] benefit from
@ instruction-level parallelism. And when comparing to aes-armv4
@ results, keep in mind the key schedule conversion overhead (see
@ bsaes-x86_64.pl for further details)...
@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
@
@					<ard.biesheuvel@linaro.org>

# include "arm_arch.h"

# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
# define VFP_ABI_FRAME	0x40
# define VFP_ABI_FRAME	0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__

#if __ARM_MAX_ARCH__>=7

.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
.type	_bsaes_decrypt8,%function
	adr	r6,_bsaes_decrypt8
	vldmia	r4!, {q9}		@ round 0 key
	add	r6,r6,#.LM0ISR-_bsaes_decrypt8

	vldmia	r6!, {q8}		@ .LM0ISR
	veor	q10, q0, q9	@ xor with round0 key
	vtbl.8	d5, {q12}, d17
	vtbl.8	d6, {q13}, d16
	vtbl.8	d7, {q13}, d17
	vtbl.8	d8, {q14}, d16
	vtbl.8	d9, {q14}, d17
	vtbl.8	d10, {q15}, d16
	vtbl.8	d11, {q15}, d17
	vtbl.8	d12, {q10}, d16
	vtbl.8	d13, {q10}, d17
	vtbl.8	d14, {q11}, d16
	vtbl.8	d15, {q11}, d17
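
	@ The vtbl.8 pairs above applied the .LM0ISR permutation to each
	@ round0-XORed block; below, the eight blocks are transposed into
	@ bit-sliced form, so that q0-q7 each hold one bit plane of all
	@ eight AES states. Each masked-swap step is, roughly, the classic
	@ swapmove (a sketch in C-like pseudocode; n = 1, 2, 4 against the
	@ 0x55/0x33/0x0f masks composed below):
	@
	@	t  = ((a >> n) ^ b) & mask;
	@	b ^= t;
	@	a ^= t << n;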
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vtbl.8	d4, {q10}, d24
	vtbl.8	d5, {q10}, d25
	vtbl.8	d6, {q11}, d24
	vtbl.8	d7, {q11}, d25
	vtbl.8	d10, {q9}, d24
	vtbl.8	d11, {q9}, d25
	vtbl.8	d12, {q10}, d24
	vtbl.8	d13, {q10}, d25
	vtbl.8	d14, {q11}, d24
	vtbl.8	d15, {q11}, d25

	@ Inv_GF16	0, 1, 2, 3, s0, s1, s2, s3

	@ new smaller inversion
	veor	q14, q8, q14	@ q14=q15

	@ multiplication by 0x05-0x00-0x04-0x00
	vext.8	q8, q0, q0, #8
	vext.8	q14, q3, q3, #8
	vext.8	q15, q5, q5, #8
	vext.8	q9, q1, q1, #8
	vext.8	q10, q6, q6, #8
	vext.8	q11, q4, q4, #8
	vext.8	q12, q2, q2, #8
	vext.8	q13, q7, q7, #8
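
	@ The vext.8 {,#8} pairs above rotate each bit plane by 64 bit
	@ positions, i.e. by two rows of the bit-sliced state; together
	@ with their veor companions this multiplies every column by the
	@ circulant 0x04*x^2 + 0x05 named above, so that InvMixColumns
	@ reduces to this pre-multiplication followed by the regular
	@ MixColumns sequence below (the same decomposition bsaes-x86_64
	@ uses).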
	vext.8	q8, q0, q0, #12	@ x0 <<< 32
	vext.8	q9, q1, q1, #12
	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
	vext.8	q10, q6, q6, #12
	vext.8	q11, q4, q4, #12
	vext.8	q12, q2, q2, #12
	vext.8	q13, q7, q7, #12
	vext.8	q14, q3, q3, #12
	vext.8	q15, q5, q5, #12
	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
	vext.8	q1, q1, q1, #8
	vext.8	q8, q2, q2, #8
	vext.8	q9, q7, q7, #8
	vext.8	q2, q4, q4, #8
	vext.8	q7, q5, q5, #8
	vext.8	q4, q3, q3, #8
	vext.8	q3, q6, q6, #8
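
	@ MixColumns in the bit-sliced domain: rotating a plane by 32 or
	@ 64 bit positions (vext.8 #12 and #8 above) rotates every column
	@ of all eight states by one or two rows, so each output plane is
	@ assembled from x, x <<< 32 and (x ^ (x <<< 32)) <<< 64,
	@ following the Käsper-Schwabe formulation this module inherits
	@ from bsaes-x86_64.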
	vldmia	r6, {q12}		@ .LISR
	ite	eq				@ Thumb2 thing, sanity check in ARM
	vldmia	r6, {q12}		@ .LISRM0
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vldmia	r4, {q8}			@ last round key
.size	_bsaes_decrypt8,.-_bsaes_decrypt8

.type	_bsaes_const,%object
.LM0ISR:	@ InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR:		@ ShiftRows constants
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
	.quad	0x090d01050c000408, 0x03070b0f060a0e02
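
	@ Each .quad pair above is a 16-byte vtbl.8 index vector: the
	@ .LM0* tables fuse the bit-interleaving input permutation with
	@ (Inv)ShiftRows, .LSR/.LISR apply (Inv)ShiftRows between rounds,
	@ and .LREVM0SR additionally byte-swaps the 32-bit words so CTR
	@ counters can be kept in host order.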
.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
.size	_bsaes_const,.-_bsaes_const

.type	_bsaes_encrypt8,%function
	adr	r6,_bsaes_encrypt8
	vldmia	r4!, {q9}		@ round 0 key
	sub	r6,r6,#_bsaes_encrypt8-.LM0SR

	vldmia	r6!, {q8}		@ .LM0SR
	veor	q10, q0, q9	@ xor with round0 key
	vtbl.8	d0, {q10}, d16
	vtbl.8	d1, {q10}, d17
	vtbl.8	d2, {q11}, d16
	vtbl.8	d3, {q11}, d17
	vtbl.8	d4, {q12}, d16
	vtbl.8	d5, {q12}, d17
	vtbl.8	d6, {q13}, d16
	vtbl.8	d7, {q13}, d17
	vtbl.8	d8, {q14}, d16
	vtbl.8	d9, {q14}, d17
	vtbl.8	d10, {q15}, d16
	vtbl.8	d11, {q15}, d17
	vtbl.8	d12, {q10}, d16
	vtbl.8	d13, {q10}, d17
	vtbl.8	d14, {q11}, d16
	vtbl.8	d15, {q11}, d17
_bsaes_encrypt8_bitslice:
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vtbl.8	d4, {q10}, d24
	vtbl.8	d5, {q10}, d25
	vtbl.8	d6, {q11}, d24
	vtbl.8	d7, {q11}, d25
	vtbl.8	d10, {q9}, d24
	vtbl.8	d11, {q9}, d25
	vtbl.8	d12, {q10}, d24
	vtbl.8	d13, {q10}, d25
	vtbl.8	d14, {q11}, d24
	vtbl.8	d15, {q11}, d25

	@ Inv_GF16	0, 1, 2, 3, s0, s1, s2, s3

	@ new smaller inversion
	veor	q14, q8, q14	@ q14=q15
	vext.8	q8, q0, q0, #12	@ x0 <<< 32
	vext.8	q9, q1, q1, #12
	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
	vext.8	q10, q4, q4, #12
	vext.8	q11, q6, q6, #12
	vext.8	q12, q3, q3, #12
	vext.8	q13, q7, q7, #12
	vext.8	q14, q2, q2, #12
	vext.8	q15, q5, q5, #12
	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
	vext.8	q1, q1, q1, #8
	vext.8	q8, q3, q3, #8
	vext.8	q9, q7, q7, #8
	vext.8	q3, q6, q6, #8
	vext.8	q7, q5, q5, #8
	vext.8	q6, q2, q2, #8
	vext.8	q2, q4, q4, #8
	vldmia	r6, {q12}		@ .LSR
	ite	eq				@ Thumb2 thing, sanity check in ARM
	vldmia	r6, {q12}		@ .LSRM0
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vshl.u64	q10, q10, #1
	vshl.u64	q11, q11, #1
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #2
	vshl.u64	q11, q11, #2
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vshl.u64	q10, q10, #4
	vshl.u64	q11, q11, #4
	vldmia	r4, {q8}			@ last round key
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_key_convert,%function
	adr	r6,_bsaes_key_convert
	vld1.8	{q7},  [r4]!		@ load round 0 key
	sub	r6,r6,#_bsaes_key_convert-.LM0
	vld1.8	{q15}, [r4]!		@ load round 1 key

	vmov.i8	q8,  #0x01			@ bit masks
	vldmia	r6, {q14}		@ .LM0

	vstmia	r12!, {q7}		@ save round 0 key

	vtbl.8	d14,{q15},d28
	vtbl.8	d15,{q15},d29

	vld1.8	{q15}, [r4]!		@ load next round key
	vmvn	q0, q0		@ "pnot"

	vstmia	r12!,{q0-q7}		@ write bit-sliced round key

	vmov.i8	q7,#0x63			@ compose .L63
	@ don't save last round key
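	@ The conversion XORs the S-box affine constant 0x63 (.L63) into
	@ every round key it bit-slices, so the rounds can use an S-box
	@ with the affine constant stripped; callers compensate by fixing
	@ up the first or last round key after this returns.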
.size	_bsaes_key_convert,.-_bsaes_key_convert
.extern	AES_cbc_encrypt

.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,%function
	@ it is up to the caller to make sure we are called with enc == 0

	stmdb	sp!, {r4-r10, lr}
	ldr	r8, [ip]			@ IV is 1st arg on the stack
	mov	r2, r2, lsr#4		@ len in 16 byte blocks
	sub	sp, #0x10			@ scratch space to carry over the IV
	mov	r9, sp				@ save sp

	ldr	r10, [r3, #240]		@ get # of rounds
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
	add	r12, #96			@ size of bit-sliced key schedule
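	@ rounds<<7 reserves 128 bytes per round key; only the rounds-1
	@ inner keys are bit-sliced to 8 q-registers (128 bytes) while
	@ round 0 and the last round key stay 16 bytes each, hence the
	@ exact size (rounds-1)*128 + 2*16 = (rounds<<7) - 96.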
	@ populate the key schedule
	mov	r4, r3			@ pass key
	mov	r5, r10			@ pass # of rounds
	mov	sp, r12				@ sp now points at the key schedule
	bl	_bsaes_key_convert
	vstmia	r12, {q15}		@ save last round key
	veor	q7, q7, q6	@ fix up round 0 key

	@ populate the key schedule
	mov	r4, r3			@ pass key
	mov	r5, r10			@ pass # of rounds
	add	r12, r3, #248			@ pass key schedule
	bl	_bsaes_key_convert
	vstmia	r12, {q15}		@ save last round key
	veor	q7, q7, q6	@ fix up round 0 key

	vld1.8	{q15}, [r8]		@ load IV
	bmi	.Lcbc_dec_loop_finish

	vld1.8	{q0-q1}, [r0]!	@ load input
	vld1.8	{q2-q3}, [r0]!
#ifndef	BSAES_ASM_EXTENDED_KEY
	mov	r4, sp			@ pass the key
	vld1.8	{q4-q5}, [r0]!
	vld1.8	{q6-q7}, [r0]
	vstmia	r9, {q15}			@ put aside IV

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vld1.8	{q10-q11}, [r0]!
	vld1.8	{q12-q13}, [r0]!
	vld1.8	{q14-q15}, [r0]!
	vst1.8	{q0-q1}, [r1]!	@ write output
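
	@ Eight-block CBC decryption: the blocks were decrypted in
	@ parallel above, then XORed with the IV and the seven preceding
	@ ciphertext blocks, which are cheaper to reload from the input
	@ stream than to keep live across _bsaes_decrypt8. In effect,
	@ per block:
	@
	@	P[i] = Decrypt(K, C[i]) ^ C[i-1]	@ C[-1] is the IV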
.Lcbc_dec_loop_finish:
	vld1.8	{q0}, [r0]!		@ load input
#ifndef	BSAES_ASM_EXTENDED_KEY
	mov	r4, sp			@ pass the key
	vstmia	r9, {q15}			@ put aside IV

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vld1.8	{q10-q11}, [r0]!
	vld1.8	{q12-q13}, [r0]!
	vst1.8	{q0-q1}, [r1]!	@ write output

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vld1.8	{q10-q11}, [r0]!
	vst1.8	{q0-q1}, [r1]!	@ write output

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vld1.8	{q10-q11}, [r0]!
	vst1.8	{q0-q1}, [r1]!	@ write output

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vst1.8	{q0-q1}, [r1]!	@ write output

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8-q9}, [r0]!	@ reload input
	veor	q0, q0, q14	@ ^= IV
	vst1.8	{q0-q1}, [r1]!	@ write output

	vldmia	r9, {q14}			@ reload IV
	vld1.8	{q8}, [r0]!		@ reload input
	veor	q0, q0, q14	@ ^= IV
	vld1.8	{q15}, [r0]!		@ reload input
	vst1.8	{q0-q1}, [r1]!	@ write output

	mov	r10, r1			@ save original out pointer
	mov	r1, r9			@ use the iv scratch space as out buffer
	vmov	q4,q15		@ just in case ensure that IV
	vmov	q5,q0			@ and input are preserved
	vld1.8	{q0}, [r9,:64]		@ load result
	veor	q0, q0, q4	@ ^= IV
	vmov	q15, q5		@ q5 holds input
	vst1.8	{q0}, [r10]		@ write output
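
	@ Tail handling: the final block is decrypted through the
	@ non-bit-sliced fallback into the 16-byte IV scratch area at r9
	@ rather than the caller's buffer, so the copies preserved in
	@ q4/q5 can supply the IV XOR and the next IV before the result
	@ is stored to the real output at r10.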
#ifndef	BSAES_ASM_EXTENDED_KEY
.Lcbc_dec_bzero:			@ wipe key schedule [if any]
	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
	vst1.8	{q15}, [r8]		@ return IV
	ldmia	sp!, {r4-r10, pc}
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,%function
bsaes_ctr32_encrypt_blocks:
	cmp	r2, #8			@ use plain AES for
	blo	.Lctr_enc_short			@ small sizes

	stmdb	sp!, {r4-r10, lr}
	ldr	r8, [ip]			@ ctr is 1st arg on the stack
	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
	mov	r9, sp				@ save sp

	ldr	r10, [r3, #240]		@ get # of rounds
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
	add	r12, #96			@ size of bit-sliced key schedule

	@ populate the key schedule
	mov	r4, r3			@ pass key
	mov	r5, r10			@ pass # of rounds
	mov	sp, r12				@ sp now points at the key schedule
	bl	_bsaes_key_convert
	veor	q7,q7,q15	@ fix up last round key
	vstmia	r12, {q7}			@ save last round key

	vld1.8	{q0}, [r8]		@ load counter
	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
	vldmia	sp, {q4}		@ load round0 key

	@ populate the key schedule
	mov	r4, r3			@ pass key
	mov	r5, r10			@ pass # of rounds
	add	r12, r3, #248			@ pass key schedule
	bl	_bsaes_key_convert
	veor	q7,q7,q15	@ fix up last round key
	vstmia	r12, {q7}			@ save last round key

0:	add	r12, r3, #248
	vld1.8	{q0}, [r8]		@ load counter
	adrl	r8, .LREVM0SR		@ borrow r8
	vldmia	r12, {q4}			@ load round0 key
	sub	sp, #0x10			@ place for adjusted round0 key

	vmov.i32	q8,#1		@ compose 1<<96
	vadd.u32	q9,q8,q8		@ compose 2<<96
	vstmia	sp, {q4}		@ save adjusted round0 key
	vadd.u32	q10, q8, q9		@ compose 3<<96
	vadd.u32	q1, q0, q8	@ +1
	vadd.u32	q2, q0, q9	@ +2
	vadd.u32	q3, q0, q10	@ +3
	vadd.u32	q4, q1, q10
	vadd.u32	q5, q2, q10
	vadd.u32	q6, q3, q10
	vadd.u32	q7, q4, q10
	vadd.u32	q10, q5, q10		@ next counter

	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
	@ to flip byte order in 32-bit counter
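	@ The eight counters above are built with plain 32-bit lane adds:
	@ the counter word sits in the most significant lane, so adding
	@ "1<<96" increments it, and per-lane wrap-around gives exactly
	@ the modulo-2^32 semantics that ctr32 promises. The word is kept
	@ in host byte order; .LREVM0SR folds the byte reversal into the
	@ first permutation instead of vrev-ing every counter separately.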
	vldmia	sp, {q9}		@ load round0 key
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x10		@ pass next round key
	vldmia	r8, {q8}			@ .LREVM0SR
	mov	r5, r10			@ pass rounds
	vstmia	r9, {q10}			@ save next counter
	sub	r6, r8, #.LREVM0SR-.LSR	@ pass constants

	bl	_bsaes_encrypt8_alt

	blo	.Lctr_enc_loop_done

	vld1.8	{q8-q9}, [r0]!	@ load input
	vld1.8	{q10-q11}, [r0]!
	vld1.8	{q12-q13}, [r0]!
	vld1.8	{q14-q15}, [r0]!
	vst1.8	{q0-q1}, [r1]!	@ write output

	vmov.i32	q8, #1		@ compose 1<<96
	vext.8	q8, q9, q8, #4
	vadd.u32	q9,q8,q8		@ compose 2<<96
	vldmia	r9, {q0}			@ load counter

.Lctr_enc_loop_done:
	vld1.8	{q8}, [r0]!		@ load input
	vst1.8	{q0}, [r1]!		@ write output

#ifndef	BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero:			@ wipe key schedule [if any]
	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
	ldmia	sp!, {r4-r10, pc}	@ return

	ldr	ip, [sp]		@ ctr pointer is passed on stack
	stmdb	sp!, {r4-r8, lr}

	mov	r4, r0		@ copy arguments
	ldr	r8, [ip, #12]	@ load counter LSW
	vld1.8	{q1}, [ip]	@ load whole counter value
	vst1.8	{q1}, [sp,:64]	@ copy counter value
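
	@ Short-input path: fewer than 8 blocks are handled one at a time
	@ with the plain (non-bit-sliced) AES fallback. The counter value
	@ lives at sp+0x10 and its encryption at sp; each input block is
	@ XORed with the latter while the 32-bit counter LSW in r8 is
	@ incremented and stored back for the next iteration.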
.Lctr_enc_short_loop:
	add	r0, sp, #0x10		@ input counter value
	mov	r1, sp			@ output on the stack

	vld1.8	{q0}, [r4]!	@ load input
	vld1.8	{q1}, [sp,:64]	@ load encrypted counter
	str	r0, [sp, #0x1c]		@ next counter value
	str	r8, [sp, #0x1c]		@ next counter value
	vst1.8	{q0}, [r5]!	@ store output
	bne	.Lctr_enc_short_loop

	ldmia	sp!, {r4-r8, pc}
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks

.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,%function
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	mov	r6, sp				@ future r3

	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes

#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	ldr	r2, [ip, #0]			@ key2
	mov	r0, sp				@ pointer to initial tweak

	ldr	r1, [r10, #240]		@ get # of rounds
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #96			@ size of bit-sliced key schedule
	sub	r12, #48			@ place for tweak[9]
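
	@ tweak[9] is nine 16-byte tweak slots (0x90 bytes): eight for one
	@ 8-block pass plus one carried into the next pass. The +96 key
	@ schedule correction and the -0x90 tweak reservation are folded
	@ into the single "sub r12, #48" above, since 96 - 0x90 = -48.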
	@ populate the key schedule
	mov	r4, r10			@ pass key
	mov	r5, r1			@ pass # of rounds
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	veor	q7, q7, q15	@ fix up last round key
	vstmia	r12, {q7}			@ save last round key

	ldr	r12, [r10, #244]
	str	r12, [r10, #244]
	mov	r4, r10			@ pass key
	mov	r5, r1			@ pass # of rounds
	add	r12, r10, #248			@ pass key schedule
	bl	_bsaes_key_convert
	veor	q7, q7, q15	@ fix up last round key

0:	sub	sp, #0x90			@ place for tweak[9]
	vld1.8	{q8}, [r0]			@ initial tweak

	vldmia	r2, {q5}	@ load XTS magic
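	@ The chain below derives tweak[1..8] from the initial tweak, each
	@ step multiplying by x in GF(2^128) under the XTS reduction
	@ polynomial (the 0x87 "magic" just loaded into q5). At the
	@ 128-bit level each step is, as a sketch:
	@
	@	carry = T >> 127;
	@	T = (T << 1) ^ (carry * 0x87);
	@
	@ vshr.s64 builds the per-lane carry masks, vadd.u64 doubles both
	@ 64-bit halves, and the masked, half-swapped carries are folded
	@ back in by the interleaved vand/vext/veor steps.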
	vshr.s64	q6, q8, #63
	vst1.64	{q8}, [r0,:128]!
	vshr.s64	q7, q9, #63
	vadd.u64	q10, q9, q9
	vst1.64	{q9}, [r0,:128]!
	vshr.s64	q6, q10, #63
	vadd.u64	q11, q10, q10
	vst1.64	{q10}, [r0,:128]!
	vshr.s64	q7, q11, #63
	vadd.u64	q12, q11, q11
	vst1.64	{q11}, [r0,:128]!
	vshr.s64	q6, q12, #63
	vadd.u64	q13, q12, q12
	vst1.64	{q12}, [r0,:128]!
	vshr.s64	q7, q13, #63
	vadd.u64	q14, q13, q13
	vst1.64	{q13}, [r0,:128]!
	vshr.s64	q6, q14, #63
	vadd.u64	q15, q14, q14
	vst1.64	{q14}, [r0,:128]!
	vshr.s64	q7, q15, #63
	vadd.u64	q8, q15, q15
	vst1.64	{q15}, [r0,:128]!
	vst1.64	{q8}, [r0,:128]		@ next round tweak

	vld1.8	{q6-q7}, [r7]!
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q14-q15}, [r0,:128]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vst1.8	{q12-q13}, [r8]!

	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vldmia	r2, {q5}	@ load XTS magic
	vshr.s64	q7, q8, #63
	vst1.64	{q8}, [r0,:128]!
	vshr.s64	q6, q9, #63
	vadd.u64	q10, q9, q9
	vst1.64	{q9}, [r0,:128]!
	vshr.s64	q7, q10, #63
	vadd.u64	q11, q10, q10
	vst1.64	{q10}, [r0,:128]!
	vshr.s64	q6, q11, #63
	vadd.u64	q12, q11, q11
	vst1.64	{q11}, [r0,:128]!
	vshr.s64	q7, q12, #63
	vadd.u64	q13, q12, q12
	vst1.64	{q12}, [r0,:128]!
	vshr.s64	q6, q13, #63
	vadd.u64	q14, q13, q13
	vst1.64	{q13}, [r0,:128]!
	vshr.s64	q7, q14, #63
	vadd.u64	q15, q14, q14
	vst1.64	{q14}, [r0,:128]!
	vshr.s64	q6, q15, #63
	vst1.64	{q15}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q14}, [r0,:128]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q14}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	@ put this in range for both ARM and Thumb mode adr instructions
	vst1.64	{q13}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q12}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q11}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q10}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.8	{q0}, [sp,:128]
	mov	r4, r3			@ preserve fp

	vld1.8	{q0}, [sp,:128]

	vmov	q8, q9		@ next round tweak

#ifndef	XTS_CHAIN_TWEAK
	ldrb	r1, [r8, #-0x10]
	strb	r0, [r8, #-0x10]

	vst1.8	{q0}, [sp,:128]
	mov	r4, r3			@ preserve fp
	vld1.8	{q0}, [sp,:128]

#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
.Lxts_enc_bzero:			@ wipe key schedule [if any]
#ifdef	XTS_CHAIN_TWEAK
	ldmia	sp!, {r4-r10, pc}	@ return
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,%function
	stmdb	sp!, {r4-r10, lr}		@ 0x20
	mov	r6, sp				@ future r3

	sub	r0, sp, #0x10			@ 0x10
	bic	r0, #0xf			@ align at 16 bytes

#ifdef	XTS_CHAIN_TWEAK
	ldr	r0, [ip]			@ pointer to input tweak
	@ generate initial tweak
	ldr	r0, [ip, #4]			@ iv[]
	ldr	r2, [ip, #0]			@ key2
	mov	r0, sp				@ pointer to initial tweak

	ldr	r1, [r10, #240]		@ get # of rounds
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
	@ add	r12, #96			@ size of bit-sliced key schedule
	sub	r12, #48			@ place for tweak[9]

	@ populate the key schedule
	mov	r4, r10			@ pass key
	mov	r5, r1			@ pass # of rounds
	add	r12, #0x90			@ pass key schedule
	bl	_bsaes_key_convert
	vstmia	r12, {q15}		@ save last round key
	veor	q7, q7, q6	@ fix up round 0 key

	ldr	r12, [r10, #244]
	str	r12, [r10, #244]
	mov	r4, r10			@ pass key
	mov	r5, r1			@ pass # of rounds
	add	r12, r10, #248			@ pass key schedule
	bl	_bsaes_key_convert
	vstmia	r12, {q15}		@ save last round key
	veor	q7, q7, q6	@ fix up round 0 key

0:	sub	sp, #0x90			@ place for tweak[9]
	vld1.8	{q8}, [r0]			@ initial tweak

	tst	r9, #0xf			@ if not multiple of 16
	it	ne				@ Thumb2 thing, sanity check in ARM
	subne	r9, #0x10			@ subtract another 16 bytes
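
	@ Ciphertext stealing: a length that is not a multiple of 16 means
	@ the last full block must be held out of the bulk loop (the
	@ 16-byte adjustment above), because on decryption it has to be
	@ processed with the tweak that follows the partial block's tweak.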
	vldmia	r2, {q5}	@ load XTS magic
	vshr.s64	q6, q8, #63
	vst1.64	{q8}, [r0,:128]!
	vshr.s64	q7, q9, #63
	vadd.u64	q10, q9, q9
	vst1.64	{q9}, [r0,:128]!
	vshr.s64	q6, q10, #63
	vadd.u64	q11, q10, q10
	vst1.64	{q10}, [r0,:128]!
	vshr.s64	q7, q11, #63
	vadd.u64	q12, q11, q11
	vst1.64	{q11}, [r0,:128]!
	vshr.s64	q6, q12, #63
	vadd.u64	q13, q12, q12
	vst1.64	{q12}, [r0,:128]!
	vshr.s64	q7, q13, #63
	vadd.u64	q14, q13, q13
	vst1.64	{q13}, [r0,:128]!
	vshr.s64	q6, q14, #63
	vadd.u64	q15, q14, q14
	vst1.64	{q14}, [r0,:128]!
	vshr.s64	q7, q15, #63
	vadd.u64	q8, q15, q15
	vst1.64	{q15}, [r0,:128]!
	vst1.64	{q8}, [r0,:128]		@ next round tweak

	vld1.8	{q6-q7}, [r7]!
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q14-q15}, [r0,:128]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vst1.8	{q12-q13}, [r8]!

	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vldmia	r2, {q5}	@ load XTS magic
	vshr.s64	q7, q8, #63
	vst1.64	{q8}, [r0,:128]!
	vshr.s64	q6, q9, #63
	vadd.u64	q10, q9, q9
	vst1.64	{q9}, [r0,:128]!
	vshr.s64	q7, q10, #63
	vadd.u64	q11, q10, q10
	vst1.64	{q10}, [r0,:128]!
	vshr.s64	q6, q11, #63
	vadd.u64	q12, q11, q11
	vst1.64	{q11}, [r0,:128]!
	vshr.s64	q7, q12, #63
	vadd.u64	q13, q12, q12
	vst1.64	{q12}, [r0,:128]!
	vshr.s64	q6, q13, #63
	vadd.u64	q14, q13, q13
	vst1.64	{q13}, [r0,:128]!
	vshr.s64	q7, q14, #63
	vadd.u64	q15, q14, q14
	vst1.64	{q14}, [r0,:128]!
	vshr.s64	q6, q15, #63
	vst1.64	{q15}, [r0,:128]		@ next round tweak
#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q14}, [r0,:128]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q14}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12-q13}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vst1.8	{q10-q11}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q13}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vld1.64	{q12}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q12}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10-q11}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vst1.8	{q8-q9}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q11}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vld1.64	{q10}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.64	{q10}, [r0,:128]		@ next round tweak

#ifndef	BSAES_ASM_EXTENDED_KEY
	add	r4, sp, #0x90			@ pass key schedule
	add	r4, r10, #248			@ pass key schedule
	mov	r5, r1			@ pass rounds

	vld1.64	{q8-q9}, [r0,:128]!
	vst1.8	{q0-q1}, [r8]!
	vld1.64	{q8}, [r0,:128]		@ next round tweak

	vst1.8	{q0}, [sp,:128]
	mov	r4, r3			@ preserve fp
	mov	r5, r2			@ preserve magic

	vld1.8	{q0}, [sp,:128]

	vmov	q8, q9		@ next round tweak

#ifndef	XTS_CHAIN_TWEAK
	@ calculate one round of extra tweak for the stolen ciphertext
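	@ On decryption the stolen-ciphertext pair consumes its tweaks in
	@ reverse order: the held-back full block needs tweak N+1, derived
	@ below with one more GF(2^128) doubling, after which the partial
	@ tail is finished with tweak N.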
	vshr.s64	q6, q8, #63

	@ perform the final decryption with the last tweak value
	vst1.8	{q0}, [sp,:128]
	mov	r4, r3			@ preserve fp

	vld1.8	{q0}, [sp,:128]
	strb	r1, [r8, #0x10]

	vst1.8	{q0}, [sp,:128]
	vld1.8	{q0}, [sp,:128]

#ifdef	XTS_CHAIN_TWEAK
	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
.Lxts_dec_bzero:			@ wipe key schedule [if any]
#ifdef	XTS_CHAIN_TWEAK
	ldmia	sp!, {r4-r10, pc}	@ return
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt