/* Do not modify. This file is auto-generated from bsaes-armv7.pl. */
@ Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
@ Licensed under the OpenSSL license (the "License"). You may not use
@ this file except in compliance with the License. You can obtain a copy
@ in the file LICENSE in the source distribution or at
@ https://www.openssl.org/source/license.html
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
@ of Linaro. Permission to use under GPL terms is granted.
@ ====================================================================
@ Bit-sliced AES for ARM NEON
@ This implementation is a direct adaptation of the bsaes-x86_64 module
@ for ARM NEON, except that this module is endian-neutral [in the sense
@ that it can be compiled for either endianness], courtesy of vld1.8's
@ neutrality. The initial version doesn't implement an interface to
@ OpenSSL, only low-level primitives and unsupported entry points, just
@ enough to collect performance results, which for the Cortex-A8 core are:
@ encrypt 19.5 cycles per byte processed with 128-bit key
@ decrypt 22.1 cycles per byte processed with 128-bit key
@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
@ which is [much] worse than anticipated (for further details see
@ http://www.openssl.org/~appro/Snapdragon-S4.html).
@ Cortex-A15 manages 14.2/16.1 cycles [where integer-only code
@ manages 20.0 cycles].
@ When comparing to x86_64 results, keep in mind that the NEON unit is
@ [mostly] single-issue and thus can't [fully] benefit from
@ instruction-level parallelism. And when comparing to aes-armv4
@ results, keep in mind the key schedule conversion overhead (see
@ bsaes-x86_64.pl for further details)...
@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
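@
@ For orientation, the exported entry points below are normally declared
@ in C as follows (a sketch of the usual OpenSSL prototypes, not taken
@ from this file itself):
@
@   void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
@                          size_t length, const AES_KEY *key,
@                          unsigned char ivec[16], int enc);
@   void bsaes_ctr32_encrypt_blocks(const unsigned char *in,
@                                   unsigned char *out, size_t len,
@                                   const AES_KEY *key,
@                                   const unsigned char ivec[16]);
@   void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
@                          size_t len, const AES_KEY *key1,
@                          const AES_KEY *key2, const unsigned char iv[16]);
@   void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
@                          size_t len, const AES_KEY *key1,
@                          const AES_KEY *key2, const unsigned char iv[16]);
@
@ r0-r3 carry the first four arguments; the remainder arrive on the
@ stack, which matches the [ip] loads in the function bodies below.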
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
# define VFP_ABI_FRAME 0x40
# define VFP_ABI_FRAME 0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#if __ARM_MAX_ARCH__>=7
.syntax unified @ ARMv7-capable assembler is expected to handle this
#if defined(__thumb2__) && !defined(__APPLE__)
.type _bsaes_decrypt8,%function
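@ Judging by the callers below, _bsaes_decrypt8 expects r4 pointing at
@ the bit-sliced key schedule, r5 holding the number of rounds, and the
@ eight 128-bit input blocks in q0-q7; the decrypted blocks come back in
@ q0-q7 as well, though not in input order (the callers account for the
@ output permutation when storing).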
vldmia r4!, {q9} @ round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
add r6,r6,#.LM0ISR-_bsaes_decrypt8
vldmia r6!, {q8} @ .LM0ISR
veor q10, q0, q9 @ xor with round0 key
vtbl.8 d0, {q10}, d16
vtbl.8 d1, {q10}, d17
vtbl.8 d2, {q11}, d16
vtbl.8 d3, {q11}, d17
vtbl.8 d4, {q12}, d16
vtbl.8 d5, {q12}, d17
vtbl.8 d6, {q13}, d16
vtbl.8 d7, {q13}, d17
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
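@ What follows is the standard bit-slice transpose, as in bsaes-x86_64:
@ three passes of the "swapmove" primitive with (mask, shift) pairs
@ (0x55, 1), (0x33, 2) and (0x0f, 4). One swapmove step on a register
@ pair (a, b) is, in C-like pseudocode:
@
@   t = (a >> n) ^ b; t &= mask; b ^= t; a ^= t << n;
@
@ After all three passes, in effect, bit i of every byte of the eight
@ blocks has been collected into register qi, which is the bit-sliced
@ form the S-box logic below operates on.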
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4!, {q8,q9,q10,q11}
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
veor q14, q8, q14 @ q14=q15
@ multiplication by 0x05-0x00-0x04-0x00
vext.8 q8, q0, q0, #8
vext.8 q14, q3, q3, #8
vext.8 q15, q5, q5, #8
vext.8 q9, q1, q1, #8
vext.8 q10, q6, q6, #8
vext.8 q11, q4, q4, #8
vext.8 q12, q2, q2, #8
vext.8 q13, q7, q7, #8
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q6, q6, #12
vext.8 q11, q4, q4, #12
vext.8 q12, q2, q2, #12
vext.8 q13, q7, q7, #12
vext.8 q14, q3, q3, #12
vext.8 q15, q5, q5, #12
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
vext.8 q1, q1, q1, #8
vext.8 q8, q2, q2, #8
vext.8 q9, q7, q7, #8
vext.8 q2, q4, q4, #8
vext.8 q7, q5, q5, #8
vext.8 q4, q3, q3, #8
vext.8 q3, q6, q6, #8
vldmia r6, {q12} @ .LISR
ite eq @ Thumb2 thing, sanity check in ARM
vldmia r6, {q12} @ .LISRM0
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4, {q8} @ last round key
.size _bsaes_decrypt8,.-_bsaes_decrypt8
.type _bsaes_const,%object
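@ The tables below are byte permutation indices consumed by vtbl.8:
@ they combine the AES (Inv)ShiftRows step with the load ordering the
@ bit-slicer expects. The labels referenced in code above and below
@ (.LISR, .LISRM0, .LSR, .LSRM0, .LM0, .LREVM0SR) belong to .quad
@ values in this same object; .LREVM0SR additionally byte-reverses the
@ 32-bit words of the CTR counter on its way into _bsaes_encrypt8_alt.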
.LM0ISR: @ InvShiftRows constants
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR: @ ShiftRows constants
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.quad 0x090d01050c000408, 0x03070b0f060a0e02
.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
vldmia r4!, {q9} @ round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
sub r6,r6,#_bsaes_encrypt8-.LM0SR
vldmia r6!, {q8} @ .LM0SR
veor q10, q0, q9 @ xor with round0 key
vtbl.8 d0, {q10}, d16
vtbl.8 d1, {q10}, d17
vtbl.8 d2, {q11}, d16
vtbl.8 d3, {q11}, d17
vtbl.8 d4, {q12}, d16
vtbl.8 d5, {q12}, d17
vtbl.8 d6, {q13}, d16
vtbl.8 d7, {q13}, d17
vtbl.8 d8, {q14}, d16
vtbl.8 d9, {q14}, d17
vtbl.8 d10, {q15}, d16
vtbl.8 d11, {q15}, d17
vtbl.8 d12, {q10}, d16
vtbl.8 d13, {q10}, d17
vtbl.8 d14, {q11}, d16
vtbl.8 d15, {q11}, d17
_bsaes_encrypt8_bitslice:
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4!, {q8,q9,q10,q11}
vtbl.8 d4, {q10}, d24
vtbl.8 d5, {q10}, d25
vtbl.8 d6, {q11}, d24
vtbl.8 d7, {q11}, d25
vtbl.8 d10, {q9}, d24
vtbl.8 d11, {q9}, d25
vtbl.8 d12, {q10}, d24
vtbl.8 d13, {q10}, d25
vtbl.8 d14, {q11}, d24
vtbl.8 d15, {q11}, d25
@ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
@ new smaller inversion
veor q14, q8, q14 @ q14=q15
vext.8 q8, q0, q0, #12 @ x0 <<< 32
vext.8 q9, q1, q1, #12
veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
vext.8 q10, q4, q4, #12
vext.8 q11, q6, q6, #12
vext.8 q12, q3, q3, #12
vext.8 q13, q7, q7, #12
vext.8 q14, q2, q2, #12
vext.8 q15, q5, q5, #12
vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
vext.8 q1, q1, q1, #8
vext.8 q8, q3, q3, #8
vext.8 q9, q7, q7, #8
vext.8 q3, q6, q6, #8
vext.8 q7, q5, q5, #8
vext.8 q6, q2, q2, #8
vext.8 q2, q4, q4, #8
vldmia r6, {q12} @ .LSR
ite eq @ Thumb2 thing, sanity check in ARM
vldmia r6, {q12} @ .LSRM0
vmov.i8 q8,#0x55 @ compose .LBS0
vmov.i8 q9,#0x33 @ compose .LBS1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vshl.u64 q10, q10, #1
vshl.u64 q11, q11, #1
vmov.i8 q8,#0x0f @ compose .LBS2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #2
vshl.u64 q11, q11, #2
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vshl.u64 q10, q10, #4
vshl.u64 q11, q11, #4
vldmia r4, {q8} @ last round key
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_key_convert,%function
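@ As the surrounding code suggests, _bsaes_key_convert takes the
@ standard AES key schedule (r4, with r5 rounds) and writes a
@ bit-sliced copy to r12: the round 0 key is stored as loaded, each
@ inner round key is expanded to eight 16-byte slices (hence 128 bytes
@ per inner round), and the 0x63 SubBytes affine constant (.L63) is
@ left in q7 for the callers, which is why they "fix up" the round 0
@ or last round key with a veor afterwards.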
vld1.8 {q7}, [r4]! @ load round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
sub r6,r6,#_bsaes_key_convert-.LM0
vld1.8 {q15}, [r4]! @ load round 1 key
vmov.i8 q8, #0x01 @ bit masks
vldmia r6, {q14} @ .LM0
vstmia r12!, {q7} @ save round 0 key
vtbl.8 d14,{q15},d28
vtbl.8 d15,{q15},d29
vld1.8 {q15}, [r4]! @ load next round key
vmvn q0, q0 @ "pnot"
vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
vmov.i8 q7,#0x63 @ compose .L63
@ don't save last round key
.size _bsaes_key_convert,.-_bsaes_key_convert
.globl bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
@ it is up to the caller to make sure we are called with enc == 0
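@ The bulk path below decrypts eight blocks per iteration with
@ _bsaes_decrypt8, then reloads the same ciphertext into q8-q15 and
@ XORs each decrypted block with the preceding ciphertext block (the
@ IV for the very first one); the 16-byte scratch slot at r9 carries
@ the chaining value across iterations and the tail cases.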
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
ldr r8, [ip] @ IV is 1st arg on the stack
mov r2, r2, lsr#4 @ len in 16 byte blocks
sub sp, #0x10 @ scratch space to carry over the IV
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp is sp
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
vld1.8 {q15}, [r8] @ load IV
bmi .Lcbc_dec_loop_finish
vld1.8 {q0,q1}, [r0]! @ load input
vld1.8 {q2,q3}, [r0]!
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
vld1.8 {q4,q5}, [r0]!
vld1.8 {q6,q7}, [r0]
vstmia r9, {q15} @ put aside IV
vldmia r9, {q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10,q11}, [r0]!
vld1.8 {q12,q13}, [r0]!
vld1.8 {q14,q15}, [r0]!
vst1.8 {q0,q1}, [r1]! @ write output
.Lcbc_dec_loop_finish:
vld1.8 {q0}, [r0]! @ load input
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
vstmia r9, {q15} @ put aside IV
vldmia r9, {q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10,q11}, [r0]!
vld1.8 {q12,q13}, [r0]!
vst1.8 {q0,q1}, [r1]! @ write output
vldmia r9,{q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10,q11}, [r0]!
vst1.8 {q0,q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q10,q11}, [r0]!
vst1.8 {q0,q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vst1.8 {q0,q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8,q9}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vst1.8 {q0,q1}, [r1]! @ write output
vldmia r9, {q14} @ reload IV
vld1.8 {q8}, [r0]! @ reload input
veor q0, q0, q14 @ ^= IV
vld1.8 {q15}, [r0]! @ reload input
vst1.8 {q0,q1}, [r1]! @ write output
mov r10, r1 @ save original out pointer
mov r1, r9 @ use the iv scratch space as out buffer
vmov q4,q15 @ just in case ensure that IV
vmov q5,q0 @ and input are preserved
vld1.8 {q0}, [r9] @ load result
veor q0, q0, q4 @ ^= IV
vmov q15, q5 @ q5 holds input
vst1.8 {q0}, [r10] @ write output
#ifndef BSAES_ASM_EXTENDED_KEY
.Lcbc_dec_bzero: @ wipe key schedule [if any]
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
vst1.8 {q15}, [r8] @ return IV
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.globl bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
bsaes_ctr32_encrypt_blocks:
cmp r2, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
ldr r8, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov r9, sp @ save sp
ldr r10, [r3, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
add r12, #96 @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
mov sp, r12 @ sp is sp
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
vld1.8 {q0}, [r8] @ load counter
mov r8, #:lower16:(.LREVM0SR-.LM0)
add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
vldmia sp, {q4} @ load round0 key
@ populate the key schedule
mov r4, r3 @ pass key
mov r5, r10 @ pass # of rounds
add r12, r3, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7,q7,q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
vld1.8 {q0}, [r8] @ load counter
adrl r8, .LREVM0SR @ borrow r8
vldmia r12, {q4} @ load round0 key
sub sp, #0x10 @ place for adjusted round0 key
vmov.i32 q8,#1 @ compose 1<<96
vadd.u32 q9,q8,q8 @ compose 2<<96
vstmia sp, {q4} @ save adjusted round0 key
vadd.u32 q10, q8, q9 @ compose 3<<96
vadd.u32 q1, q0, q8 @ +1
vadd.u32 q2, q0, q9 @ +2
vadd.u32 q3, q0, q10 @ +3
vadd.u32 q4, q1, q10
vadd.u32 q5, q2, q10
vadd.u32 q6, q3, q10
vadd.u32 q7, q4, q10
vadd.u32 q10, q5, q10 @ next counter
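@ The big-endian AES counter is kept byte-reversed in q0 so that the
@ plain vadd.u32 additions above can step it; the .LREVM0SR permutation
@ inside _bsaes_encrypt8_alt flips the 32-bit words back to proper byte
@ order, per the "flip byte order" note below.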
@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
@ to flip byte order in 32-bit counter
vldmia sp, {q9} @ load round0 key
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x10 @ pass next round key
vldmia r8, {q8} @ .LREVM0SR
mov r5, r10 @ pass rounds
vstmia r9, {q10} @ save next counter
mov r6, #:lower16:(.LREVM0SR-.LSR)
sub r6, r8, #.LREVM0SR-.LSR @ pass constants
bl _bsaes_encrypt8_alt
blo .Lctr_enc_loop_done
vld1.8 {q8,q9}, [r0]! @ load input
vld1.8 {q10,q11}, [r0]!
vld1.8 {q12,q13}, [r0]!
vld1.8 {q14,q15}, [r0]!
vst1.8 {q0,q1}, [r1]! @ write output
vmov.i32 q8, #1 @ compose 1<<96
vext.8 q8, q9, q8, #4
vadd.u32 q9,q8,q8 @ compose 2<<96
vldmia r9, {q0} @ load counter
.Lctr_enc_loop_done:
vld1.8 {q8}, [r0]! @ load input
vst1.8 {q0}, [r1]! @ write output
#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
ldr ip, [sp] @ ctr pointer is passed on stack
stmdb sp!, {r4,r5,r6,r7,r8, lr}
mov r4, r0 @ copy arguments
ldr r8, [ip, #12] @ load counter .LSW
vld1.8 {q1}, [ip] @ load whole counter value
vst1.8 {q1}, [sp] @ copy counter value
.Lctr_enc_short_loop:
add r0, sp, #0x10 @ input counter value
mov r1, sp @ output on the stack
vld1.8 {q0}, [r4]! @ load input
vld1.8 {q1}, [sp] @ load encrypted counter
str r0, [sp, #0x1c] @ next counter value
str r8, [sp, #0x1c] @ next counter value
vst1.8 {q0}, [r5]! @ store output
bne .Lctr_enc_short_loop
ldmia sp!, {r4,r5,r6,r7,r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
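@ Note on the ctr32 contract (as commonly relied upon in OpenSSL): only
@ the low 32 bits of the counter, i.e. the last big-endian word of
@ ivec[], are incremented per block; carrying across the 2^32 boundary
@ into the upper 96 bits is the caller's responsibility.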
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,%function
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20
mov r6, sp @ future r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
ldr r2, [ip, #0] @ key2
mov r0,sp @ pointer to initial tweak
ldr r1, [r10, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
vstmia r12, {q7} @ save last round key
ldr r12, [r10, #244]
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
veor q7, q7, q15 @ fix up last round key
sub sp, #0x90 @ place for tweak[9]
vld1.8 {q8}, [r0] @ initial tweak
vldmia r2, {q5} @ load XTS magic
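@ The block below computes the next eight tweaks, each the previous one
@ multiplied by x in GF(2^128): vadd.u64 doubles both 64-bit halves of
@ the tweak, while vshr.s64 ...,#63 broadcasts each half's top bit into
@ a mask; the cross-half carry and the reduction by the "XTS magic"
@ constant (presumably the usual 0x87 polynomial) are then folded in by
@ masked XORs interleaved with the tweak stores.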
vshr.s64 q6, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q7, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q6, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q7, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q6, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q7, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q6, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q7, q15, #63
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vst1.64 {q8}, [r0,:128] @ next round tweak
vld1.8 {q6,q7}, [r7]!
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q14,q15}, [r0,:128]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vst1.8 {q12,q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q6, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q7, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q6, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q7, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q6, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q7, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q6, q15, #63
vst1.64 {q15}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q14}, [r0,:128]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
@ put this in range for both ARM and Thumb mode adr instructions
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
vmov q8, q9 @ next round tweak
#ifndef XTS_CHAIN_TWEAK
ldrb r1, [r8, #-0x10]
strb r0, [r8, #-0x10]
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
.Lxts_enc_bzero: @ wipe key schedule [if any]
#ifdef XTS_CHAIN_TWEAK
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,%function
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20
mov r6, sp @ future r3
sub r0, sp, #0x10 @ 0x10
bic r0, #0xf @ align at 16 bytes
#ifdef XTS_CHAIN_TWEAK
ldr r0, [ip] @ pointer to input tweak
@ generate initial tweak
ldr r0, [ip, #4] @ iv[]
ldr r2, [ip, #0] @ key2
mov r0, sp @ pointer to initial tweak
ldr r1, [r10, #240] @ get # of rounds
#ifndef BSAES_ASM_EXTENDED_KEY
@ allocate the key schedule on the stack
sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
@ add r12, #96 @ size of bit-sliced key schedule
sub r12, #48 @ place for tweak[9]
@ populate the key schedule
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
ldr r12, [r10, #244]
str r12, [r10, #244]
mov r4, r10 @ pass key
mov r5, r1 @ pass # of rounds
add r12, r10, #248 @ pass key schedule
bl _bsaes_key_convert
vstmia r12, {q15} @ save last round key
veor q7, q7, q6 @ fix up round 0 key
sub sp, #0x90 @ place for tweak[9]
vld1.8 {q8}, [r0] @ initial tweak
#ifndef XTS_CHAIN_TWEAK
tst r9, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne r9, #0x10 @ subtract another 16 bytes
vldmia r2, {q5} @ load XTS magic
vshr.s64 q6, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q7, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q6, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q7, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q6, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q7, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q6, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q7, q15, #63
vadd.u64 q8, q15, q15
vst1.64 {q15}, [r0,:128]!
vst1.64 {q8}, [r0,:128] @ next round tweak
vld1.8 {q6,q7}, [r7]!
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q14,q15}, [r0,:128]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vst1.8 {q12,q13}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vldmia r2, {q5} @ load XTS magic
vshr.s64 q7, q8, #63
vst1.64 {q8}, [r0,:128]!
vshr.s64 q6, q9, #63
vadd.u64 q10, q9, q9
vst1.64 {q9}, [r0,:128]!
vshr.s64 q7, q10, #63
vadd.u64 q11, q10, q10
vst1.64 {q10}, [r0,:128]!
vshr.s64 q6, q11, #63
vadd.u64 q12, q11, q11
vst1.64 {q11}, [r0,:128]!
vshr.s64 q7, q12, #63
vadd.u64 q13, q12, q12
vst1.64 {q12}, [r0,:128]!
vshr.s64 q6, q13, #63
vadd.u64 q14, q13, q13
vst1.64 {q13}, [r0,:128]!
vshr.s64 q7, q14, #63
vadd.u64 q15, q14, q14
vst1.64 {q14}, [r0,:128]!
vshr.s64 q6, q15, #63
vst1.64 {q15}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q14}, [r0,:128]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.64 {q14}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12,q13}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vst1.8 {q10,q11}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vld1.64 {q12}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10,q11}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vst1.8 {q8,q9}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vld1.64 {q10}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
add r4, r10, #248 @ pass key schedule
mov r5, r1 @ pass rounds
vld1.64 {q8,q9}, [r0,:128]!
vst1.8 {q0,q1}, [r8]!
vld1.64 {q8}, [r0,:128] @ next round tweak
vst1.8 {q0}, [sp,:128]
mov r5, r2 @ preserve magic
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
vmov q8, q9 @ next round tweak
#ifndef XTS_CHAIN_TWEAK
@ calculate one round of extra tweak for the stolen ciphertext
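@ (Ciphertext stealing, as in standard XTS: for a tail that is not a
@ multiple of 16, the last complete ciphertext block is decrypted with
@ the tweak that follows it, the partial tail is stitched onto that
@ result, and the combined block is then decrypted with the current
@ tweak before the swapped bytes are written back.)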
vshr.s64 q6, q8, #63
@ perform the final decryption with the last tweak value
vst1.8 {q0}, [sp,:128]
mov r4, r3 @ preserve fp
vld1.8 {q0}, [sp,:128]
strb r1, [r8, #0x10]
vst1.8 {q0}, [sp,:128]
vld1.8 {q0}, [sp,:128]
#ifdef XTS_CHAIN_TWEAK
ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
.Lxts_dec_bzero: @ wipe key schedule [if any]
#ifdef XTS_CHAIN_TWEAK
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt