2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
/*
 * NOTE(review): presumably this asks a project header (its include is not
 * visible in this view) to define the assembly helper macros used below,
 * such as lxvw4x(), vcipher() and mtctr() -- confirm against that header.
 */
25 #define BR_POWER_ASM_MACROS 1
/*
 * Returns a pointer to br_aes_pwr8_ctrcbc_vtable when
 * br_aes_pwr8_supported() returns nonzero, and NULL otherwise, so that
 * callers can test for availability before using this implementation.
 */
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class *
32 br_aes_pwr8_ctrcbc_get_vtable(void)
34 return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
/*
 * Initialise a CTR + CBC-MAC context.
 *
 *   ctx   context to fill in
 *   key   AES key bytes
 *   len   key length, passed unchanged to br_aes_pwr8_keysched()
 *
 * The value returned by the key schedule is kept in ctx->num_rounds; the
 * entry points below switch on it to pick the 128/192/256 routine.
 */
37 /* see bearssl_block.h */
39 br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
40 const void *key, size_t len)
/* Bind the context to this implementation's method table. */
42 ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
/* Expand the key into ctx->skey.skni and remember the round count. */
43 ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
47 * Register conventions for CTR + CBC-MAC:
49 * AES subkeys are in registers v0 to v10/v12/v14 (depending on key size)
50 * Register v15 contains the byteswap index register (little-endian only)
51 * Register v16 contains the CTR counter value
52 * Register v17 contains the CBC-MAC current value
53 * Registers v18 to v27 are scratch
54 * Counter increment uses v28, v29 and v30
* Register conventions for CTR alone (four counter blocks in parallel):
58 * AES subkeys are in registers v0 to v10/v12/v14 (depending on key size)
59 * Register v15 contains the byteswap index register (little-endian only)
60 * Registers v16 to v19 contain the CTR counter values (four blocks)
61 * Registers v20 to v27 are scratch
62 * Counter increment uses v28, v29 and v30
/*
 * LOAD_SUBKEYS_128: assembly fragment that loads eleven consecutive
 * 16-byte subkeys, addressed through %[cc] and %[sk], into VSX registers
 * 32 to 42. Elsewhere in this file VSX register 32+n is referred to as
 * vector register vn, so these are v0..v10.
 *
 * %[cc] is bumped by 16 between loads, so it ends 160 higher than it
 * started. Its initial value is set by the enclosing asm block (that
 * instruction is not visible in this view).
 */
65 #define LOAD_SUBKEYS_128 \
66 lxvw4x(32, %[cc], %[sk]) \
67 addi(%[cc], %[cc], 16) \
68 lxvw4x(33, %[cc], %[sk]) \
69 addi(%[cc], %[cc], 16) \
70 lxvw4x(34, %[cc], %[sk]) \
71 addi(%[cc], %[cc], 16) \
72 lxvw4x(35, %[cc], %[sk]) \
73 addi(%[cc], %[cc], 16) \
74 lxvw4x(36, %[cc], %[sk]) \
75 addi(%[cc], %[cc], 16) \
76 lxvw4x(37, %[cc], %[sk]) \
77 addi(%[cc], %[cc], 16) \
78 lxvw4x(38, %[cc], %[sk]) \
79 addi(%[cc], %[cc], 16) \
80 lxvw4x(39, %[cc], %[sk]) \
81 addi(%[cc], %[cc], 16) \
82 lxvw4x(40, %[cc], %[sk]) \
83 addi(%[cc], %[cc], 16) \
84 lxvw4x(41, %[cc], %[sk]) \
85 addi(%[cc], %[cc], 16) \
86 lxvw4x(42, %[cc], %[sk])
/*
 * LOAD_SUBKEYS_192: the visible part advances %[cc] by 16 twice and loads
 * two more subkeys into VSX registers 43 and 44 (v11 and v12).
 * NOTE(review): the first line of the expansion is not visible in this
 * view; presumably it loads the first eleven subkeys -- confirm.
 */
88 #define LOAD_SUBKEYS_192 \
90 addi(%[cc], %[cc], 16) \
91 lxvw4x(43, %[cc], %[sk]) \
92 addi(%[cc], %[cc], 16) \
93 lxvw4x(44, %[cc], %[sk])
/*
 * LOAD_SUBKEYS_256: the visible part advances %[cc] by 16 twice and loads
 * two more subkeys into VSX registers 45 and 46 (v13 and v14).
 * NOTE(review): the first line of the expansion is not visible in this
 * view; presumably it loads the preceding subkeys -- confirm.
 */
95 #define LOAD_SUBKEYS_256 \
97 addi(%[cc], %[cc], 16) \
98 lxvw4x(45, %[cc], %[sk]) \
99 addi(%[cc], %[cc], 16) \
100 lxvw4x(46, %[cc], %[sk])
/*
 * BLOCK_ENCRYPT_<size>(x): in-place AES encryption of the block held in
 * vector register x, using the subkeys already loaded in the low vector
 * registers. Only the closing vcipherlast step of each variant is visible
 * in this view; it uses subkey register 10, 12 or 14 for the 128-, 192-
 * and 256-bit variants, matching the register conventions documented
 * earlier in this file.
 */
102 #define BLOCK_ENCRYPT_128(x) \
113 vcipherlast(x, x, 10)
/* 192-bit keys: the last subkey is in register 12. */
115 #define BLOCK_ENCRYPT_192(x) \
128 vcipherlast(x, x, 12)
/* 256-bit keys: the last subkey is in register 14. */
130 #define BLOCK_ENCRYPT_256(x) \
145 vcipherlast(x, x, 14)
/*
 * BLOCK_ENCRYPT_X2_<size>(x, y): same as BLOCK_ENCRYPT_<size>, but for two
 * independent blocks in vector registers x and y, with the per-round
 * instructions for x and y interleaved. Only the final vcipherlast pair
 * of each variant is visible in this view.
 */
147 #define BLOCK_ENCRYPT_X2_128(x, y) \
168 vcipherlast(x, x, 10) \
169 vcipherlast(y, y, 10)
/* 192-bit keys: the last subkey is in register 12. */
171 #define BLOCK_ENCRYPT_X2_192(x, y) \
196 vcipherlast(x, x, 12) \
197 vcipherlast(y, y, 12)
/* 256-bit keys: the last subkey is in register 14. */
199 #define BLOCK_ENCRYPT_X2_256(x, y) \
228 vcipherlast(x, x, 14) \
229 vcipherlast(y, y, 14)
/*
 * BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3): in-place AES-128 encryption of
 * four independent blocks held in vector registers x0..x3. Only the last
 * round (vcipherlast with subkey register 10) is visible in this view.
 */
231 #define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
272 vcipherlast(x0, x0, 10) \
273 vcipherlast(x1, x1, 10) \
274 vcipherlast(x2, x2, 10) \
275 vcipherlast(x3, x3, 10)
/*
 * BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3): in-place AES-192 encryption of
 * four independent blocks held in vector registers x0..x3. Each round is
 * issued for all four blocks before moving to the next subkey. The
 * visible tail applies vcipher with subkeys 10 and 11, then vcipherlast
 * with subkey 12; the earlier rounds are not visible in this view.
 */
277 #define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
318 vcipher(x0, x0, 10) \
319 vcipher(x1, x1, 10) \
320 vcipher(x2, x2, 10) \
321 vcipher(x3, x3, 10) \
322 vcipher(x0, x0, 11) \
323 vcipher(x1, x1, 11) \
324 vcipher(x2, x2, 11) \
325 vcipher(x3, x3, 11) \
326 vcipherlast(x0, x0, 12) \
327 vcipherlast(x1, x1, 12) \
328 vcipherlast(x2, x2, 12) \
329 vcipherlast(x3, x3, 12)
/*
 * BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3): in-place AES-256 encryption of
 * four independent blocks held in vector registers x0..x3. Each round is
 * issued for all four blocks before moving to the next subkey. The
 * visible tail applies vcipher with subkeys 10 to 13, then vcipherlast
 * with subkey 14; the earlier rounds are not visible in this view.
 */
331 #define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
372 vcipher(x0, x0, 10) \
373 vcipher(x1, x1, 10) \
374 vcipher(x2, x2, 10) \
375 vcipher(x3, x3, 10) \
376 vcipher(x0, x0, 11) \
377 vcipher(x1, x1, 11) \
378 vcipher(x2, x2, 11) \
379 vcipher(x3, x3, 11) \
380 vcipher(x0, x0, 12) \
381 vcipher(x1, x1, 12) \
382 vcipher(x2, x2, 12) \
383 vcipher(x3, x3, 12) \
384 vcipher(x0, x0, 13) \
385 vcipher(x1, x1, 13) \
386 vcipher(x2, x2, 13) \
387 vcipher(x3, x3, 13) \
388 vcipherlast(x0, x0, 14) \
389 vcipherlast(x1, x1, 14) \
390 vcipherlast(x2, x2, 14) \
391 vcipherlast(x3, x3, 14)
/*
 * Byte-swap support. Two sets of definitions appear below.
 *
 * The first set loads the idx2be table into VSX register 47 (v15) and
 * implements the swap as a vperm using v15 as the permutation control.
 * The register conventions earlier in this file describe v15 as
 * "little-endian only".
 *
 * The second set has an empty BYTESWAP_INIT and a BYTESWAPX that only
 * copies s to d (vand of s with itself).
 * NOTE(review): the preprocessor conditional that selects between the two
 * sets is not visible in this view; presumably it tests endianness.
 */
394 static const uint32_t idx2be[] = {
395 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
/* Load the permutation table into v15 (VSX register 47). */
397 #define BYTESWAP_INIT lxvw4x(47, 0, %[idx2be])
/* In-place permutation of x through v15. */
398 #define BYTESWAP(x) vperm(x, x, x, 15)
/* Same permutation, from s into a separate destination d. */
399 #define BYTESWAPX(d, s) vperm(d, s, s, 15)
/* Extra asm input operand that makes idx2be addressable as %[idx2be]. */
400 #define BYTESWAP_REG , [idx2be] "b" (idx2be)
/* Alternate definitions: no table to load, BYTESWAPX is a plain copy. */
402 #define BYTESWAP_INIT
404 #define BYTESWAPX(d, s) vand(d, s, s)
/*
 * Counter increment support. The register conventions earlier in this
 * file reserve v28, v29 and v30 for it.
 *
 * ctrinc and ctrinc_x4 are constant tables whose contents are not visible
 * in this view. NOTE(review): presumably they hold an increment of 1 and
 * of 4 respectively -- confirm.
 */
408 static const uint32_t ctrinc[] = {
411 static const uint32_t ctrinc_x4[] = {
/* Load the selected increment table into v28 (VSX register 60). */
414 #define INCR_128_INIT lxvw4x(60, 0, %[ctrinc])
415 #define INCR_128_X4_INIT lxvw4x(60, 0, %[ctrinc_x4])
/*
 * INCR_128(d, s): only three identical vsldoi steps of the expansion are
 * visible here; each shifts v29 left by 4 bytes into v30.
 * NOTE(review): presumably these move a carry from one 32-bit word to the
 * next more significant one -- confirm against the full macro.
 */
416 #define INCR_128(d, s) \
419 vsldoi(30, 29, 29, 4) \
422 vsldoi(30, 29, 29, 4) \
425 vsldoi(30, 29, 29, 4) \
/*
 * MKCTR(size): generates ctr_<size>(sk, ctrbuf, buf, num_blocks_x4), the
 * CTR-only routine that handles four blocks (64 bytes) per iteration.
 *
 *   sk             subkeys, loaded with LOAD_SUBKEYS_<size>
 *   ctrbuf         four 16-byte counter values, read from offsets
 *                  %[cc0]..%[cc3] into v16..v19 and written back at
 *                  the end
 *   buf            data, processed in place; advanced by 64 bytes per
 *                  iteration
 *   num_blocks_x4  iteration count, moved into the CTR register
 *
 * Each iteration loads four data blocks, encrypts the four counter
 * values with BLOCK_ENCRYPT_X4_<size>, and stores four blocks back to
 * buf. Several instructions of the expansion are not visible in this
 * view, including the XOR, the loop branch and the setup of cc0..cc3.
 */
428 #define MKCTR(size) \
430 ctr_ ## size(const unsigned char *sk, \
431 unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
433 long cc, cc0, cc1, cc2, cc3; \
443 * Load subkeys into v0..v10 \
445 LOAD_SUBKEYS_ ## size \
452 * Load current CTR counters into v16 to v19. \
454 lxvw4x(48, %[cc0], %[ctrbuf]) \
455 lxvw4x(49, %[cc1], %[ctrbuf]) \
456 lxvw4x(50, %[cc2], %[ctrbuf]) \
457 lxvw4x(51, %[cc3], %[ctrbuf]) \
463 mtctr(%[num_blocks_x4]) \
467 * Compute next counter values into v20..v23. \
475 * Encrypt counter values and XOR into next data blocks. \
477 lxvw4x(56, %[cc0], %[buf]) \
478 lxvw4x(57, %[cc1], %[buf]) \
479 lxvw4x(58, %[cc2], %[buf]) \
480 lxvw4x(59, %[cc3], %[buf]) \
485 BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
494 stxvw4x(48, %[cc0], %[buf]) \
495 stxvw4x(49, %[cc1], %[buf]) \
496 stxvw4x(50, %[cc2], %[buf]) \
497 stxvw4x(51, %[cc3], %[buf]) \
500 * Update counters and data pointer. \
506 addi(%[buf], %[buf], 64) \
511 * Write back new counter values. \
517 stxvw4x(48, %[cc0], %[ctrbuf]) \
518 stxvw4x(49, %[cc1], %[ctrbuf]) \
519 stxvw4x(50, %[cc2], %[ctrbuf]) \
520 stxvw4x(51, %[cc3], %[ctrbuf]) \
522 : [cc] "+b" (cc), [buf] "+b" (buf), \
523 [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \
524 : [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
525 [num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
527 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
528 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
529 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
530 "v30", "ctr", "memory" \
/*
 * MKCBCMAC(size): generates cbcmac_<size>(sk, cbcmac, buf, num_blocks),
 * the CBC-MAC-only routine.
 *
 *   sk          subkeys, loaded with LOAD_SUBKEYS_<size>
 *   cbcmac      16-byte running MAC value; loaded into v16 on entry and
 *               written back on exit
 *   buf         input data, read one 16-byte block per iteration
 *   num_blocks  iteration count, moved into the CTR register
 *
 * Each iteration loads the next block into v17, combines it with v16 and
 * encrypts v16 with BLOCK_ENCRYPT_<size>. The XOR and the loop branch are
 * not visible in this view.
 */
538 #define MKCBCMAC(size) \
540 cbcmac_ ## size(const unsigned char *sk, \
541 unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
549 * Load subkeys into v0..v10 \
551 LOAD_SUBKEYS_ ## size \
557 * Load current CBC-MAC value into v16. \
559 lxvw4x(48, %[cc], %[cbcmac]) \
562 mtctr(%[num_blocks]) \
566 * Load next block, XOR into current CBC-MAC value, \
567 * and then encrypt it. \
569 lxvw4x(49, %[cc], %[buf]) \
572 BLOCK_ENCRYPT_ ## size(16) \
573 addi(%[buf], %[buf], 16) \
578 * Write back new CBC-MAC value. \
581 stxvw4x(48, %[cc], %[cbcmac]) \
583 : [cc] "+b" (cc), [buf] "+b" (buf) \
584 : [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
586 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
587 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
588 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
589 "v30", "ctr", "memory" \
/*
 * MKENCRYPT(size): generates ctrcbc_<size>_encrypt(), which performs CTR
 * encryption and CBC-MAC over the resulting ciphertext in one pass.
 *
 *   sk          subkeys, loaded with LOAD_SUBKEYS_<size>
 *   ctr         16-byte counter; loaded into v16, written back on exit
 *   cbcmac      16-byte MAC value; loaded into v17, written back on exit
 *   buf         data, processed in place, 16 bytes at a time
 *   num_blocks  block count operand, moved into the CTR register
 *
 * The MAC runs one block behind the encryption. The first block is
 * encrypted before the loop. Each loop iteration runs
 * BLOCK_ENCRYPT_X2_<size>(16, 17) to encrypt the next counter value and
 * the MAC state together. A final BLOCK_ENCRYPT_<size>(17) after the
 * loop completes the MAC.
 *
 * Because the first block is handled unconditionally before the loop,
 * callers must not reach this routine with zero blocks.
 */
597 #define MKENCRYPT(size) \
599 ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
600 unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
609 * Load subkeys into v0..v10 \
611 LOAD_SUBKEYS_ ## size \
618 * Load current CTR counter into v16, and current \
619 * CBC-MAC IV into v17. \
621 lxvw4x(48, %[cc], %[ctr]) \
622 lxvw4x(49, %[cc], %[cbcmac]) \
627 * At each iteration, we do two parallel encryption: \
628 * - new counter value for encryption of the next block; \
629 * - CBC-MAC over the previous encrypted block. \
630 * Thus, each plaintext block implies two AES instances, \
631 * over two successive iterations. This requires a single \
632 * counter encryption before the loop, and a single \
633 * CBC-MAC encryption after the loop. \
637 * Encrypt first block (into v20). \
639 lxvw4x(52, %[cc], %[buf]) \
642 BLOCK_ENCRYPT_ ## size(16) \
645 stxvw4x(53, %[cc], %[buf]) \
647 addi(%[buf], %[buf], 16) \
650 * Load loop counter; skip the loop if there is only \
651 * one block in total (already handled by the boundary \
654 mtctr(%[num_blocks]) \
660 * v16 counter value for next block \
661 * v17 current CBC-MAC value \
662 * v20 encrypted previous block \
666 lxvw4x(52, %[cc], %[buf]) \
668 BLOCK_ENCRYPT_X2_ ## size(16, 17) \
671 stxvw4x(53, %[cc], %[buf]) \
672 addi(%[buf], %[buf], 16) \
679 BLOCK_ENCRYPT_ ## size(17) \
682 stxvw4x(48, %[cc], %[ctr]) \
683 stxvw4x(49, %[cc], %[cbcmac]) \
685 : [cc] "+b" (cc), [buf] "+b" (buf) \
686 : [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
687 [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
689 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
690 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
691 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
692 "v30", "ctr", "memory" \
/*
 * MKDECRYPT(size): generates ctrcbc_<size>_decrypt(), which computes the
 * CBC-MAC over the incoming ciphertext and CTR-decrypts it in one pass.
 *
 *   sk          subkeys, loaded with LOAD_SUBKEYS_<size>
 *   ctr         16-byte counter; loaded into v16, written back on exit
 *   cbcmac      16-byte MAC value; loaded into v17, written back on exit
 *   buf         data, processed in place, 16 bytes at a time
 *   num_blocks  iteration count, moved into the CTR register
 *
 * Unlike the encryption routine, both AES operations for a block happen
 * in the same iteration through BLOCK_ENCRYPT_X2_<size>(16, 17), so no
 * extra work is needed before or after the loop.
 */
700 #define MKDECRYPT(size) \
702 ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
703 unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
712 * Load subkeys into v0..v10 \
714 LOAD_SUBKEYS_ ## size \
721 * Load current CTR counter into v16, and current \
722 * CBC-MAC IV into v17. \
724 lxvw4x(48, %[cc], %[ctr]) \
725 lxvw4x(49, %[cc], %[cbcmac]) \
730 * At each iteration, we do two parallel encryption: \
731 * - new counter value for decryption of the next block; \
732 * - CBC-MAC over the next encrypted block. \
733 * Each iteration performs the two AES instances related \
734 * to the current block; there is thus no need for some \
735 * extra pre-loop and post-loop work as in encryption. \
738 mtctr(%[num_blocks]) \
743 * v16 counter value for next block \
744 * v17 current CBC-MAC value \
746 lxvw4x(52, %[cc], %[buf]) \
750 BLOCK_ENCRYPT_X2_ ## size(16, 17) \
753 stxvw4x(53, %[cc], %[buf]) \
754 addi(%[buf], %[buf], 16) \
760 * Store back counter and CBC-MAC value. \
764 stxvw4x(48, %[cc], %[ctr]) \
765 stxvw4x(49, %[cc], %[cbcmac]) \
767 : [cc] "+b" (cc), [buf] "+b" (buf) \
768 : [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
769 [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
771 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
772 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
773 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
774 "v30", "ctr", "memory" \
/*
 * Combined CTR encryption + CBC-MAC entry point.
 *
 *   ctx     initialised key context
 *   ctr     16-byte counter, updated in place
 *   cbcmac  16-byte CBC-MAC value, updated in place
 *   data    buffer, encrypted in place
 *   len     length in bytes; converted to a block count with len >> 4,
 *           so any trailing partial block is not processed
 *
 * Dispatches on ctx->num_rounds to the 128/192/256 routine.
 * NOTE(review): the generated encrypt routine handles one block before
 * its loop, so confirm that len == 0 is filtered out before the switch
 * (such a guard is not visible in this view).
 */
782 /* see bearssl_block.h */
784 br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
785 void *ctr, void *cbcmac, void *data, size_t len)
790 switch (ctx->num_rounds) {
792 ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
795 ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
798 ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
/*
 * Combined CBC-MAC + CTR decryption entry point.
 *
 *   ctx     initialised key context
 *   ctr     16-byte counter, updated in place
 *   cbcmac  16-byte CBC-MAC value, updated in place
 *   data    buffer, decrypted in place
 *   len     length in bytes; converted to a block count with len >> 4,
 *           so any trailing partial block is not processed
 *
 * Dispatches on ctx->num_rounds to the 128/192/256 routine.
 */
803 /* see bearssl_block.h */
805 br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
806 void *ctr, void *cbcmac, void *data, size_t len)
811 switch (ctx->num_rounds) {
813 ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
816 ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
819 ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
/*
 * Derive the counter value in dst from the 16-byte counter in src, which
 * is handled as two big-endian 64-bit halves (high half first). dst and
 * src may be different buffers; the caller below chains calls to build
 * consecutive counter values.
 * NOTE(review): the statement that modifies lo and the store of hi into
 * dst are not visible in this view; presumably lo is incremented by one.
 */
825 incr_ctr(void *dst, const void *src)
/* Split the counter into its high and low 64-bit halves. */
829 hi = br_dec64be(src);
830 lo = br_dec64be((const unsigned char *)src + 8);
/*
 * ((lo | -lo) >> 63) is 0 only when lo == 0, so the XOR with 1 yields 1
 * exactly when lo is zero: a branch-free way to add the carry into hi.
 */
832 hi += ((lo | -lo) >> 63) ^ (uint64_t)1;
/* Low half goes into bytes 8..15 of dst. */
834 br_enc64be((unsigned char *)dst + 8, lo);
/*
 * CTR-only encryption/decryption entry point.
 *
 *   ctx   initialised key context
 *   ctr   16-byte counter, updated in place
 *   data  buffer, processed in place
 *   len   length in bytes
 *
 * The assembly routines handle 64 bytes (four blocks) per iteration, so
 * the input is split into a bulk part of len >> 6 groups and a tail that
 * goes through a zero-padded 64-byte temporary buffer.
 * NOTE(review): the conditions guarding the bulk call, the tail and each
 * counter write-back are not visible in this view.
 */
837 /* see bearssl_block.h */
839 br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
840 void *ctr, void *data, size_t len)
/* Room for the four counter values consumed per 64-byte group. */
842 unsigned char ctrbuf[64];
/* Slot 0 is the caller's counter; each later slot comes from the previous one. */
844 memcpy(ctrbuf, ctr, 16);
845 incr_ctr(ctrbuf + 16, ctrbuf);
846 incr_ctr(ctrbuf + 32, ctrbuf + 16);
847 incr_ctr(ctrbuf + 48, ctrbuf + 32);
/* Bulk part: len >> 6 groups of 64 bytes, routine chosen by round count. */
849 switch (ctx->num_rounds) {
851 ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
854 ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
857 ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
/* Skip past the bytes covered by the bulk part. */
860 data = (unsigned char *)data + (len & ~(size_t)63);
/* Tail: scratch buffer large enough for one full four-block group. */
864 unsigned char tmp[64];
/*
 * Report the updated counter to the caller by copying one slot of ctrbuf.
 * This is done before the tail call below, which rewrites ctrbuf.
 * Presumably the slot is chosen by the number of whole blocks left in
 * the tail.
 */
868 memcpy(ctr, ctrbuf + 48, 16);
870 memcpy(ctr, ctrbuf + 32, 16);
874 memcpy(ctr, ctrbuf + 16, 16);
/* Process the tail as one zero-padded group, then copy back only len bytes. */
877 memcpy(tmp, data, len);
878 memset(tmp + len, 0, (sizeof tmp) - len);
879 switch (ctx->num_rounds) {
881 ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
884 ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
887 ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
890 memcpy(data, tmp, len);
/* Otherwise the counter to report is slot 0 of ctrbuf. */
892 memcpy(ctr, ctrbuf, 16);
/*
 * CBC-MAC-only entry point.
 *
 *   ctx     initialised key context
 *   cbcmac  16-byte running MAC value, updated in place
 *   data    input data (read only)
 *   len     length in bytes; converted to a block count with len >> 4,
 *           so any trailing partial block is not processed
 *
 * Dispatches on ctx->num_rounds to the 128/192/256 routine.
 */
896 /* see bearssl_block.h */
898 br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
899 void *cbcmac, const void *data, size_t len)
902 switch (ctx->num_rounds) {
904 cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
907 cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
910 cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
/*
 * Method table for this implementation. The concrete functions take a
 * br_aes_pwr8_ctrcbc_keys pointer, while the table's slots use the
 * generic br_block_ctrcbc_class context pointer types, hence the
 * function-pointer casts below.
 * NOTE(review): some initializer entries are not visible in this view.
 */
916 /* see bearssl_block.h */
917 const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
/* Size of the context structure. */
918 sizeof(br_aes_pwr8_ctrcbc_keys),
/* Key setup. */
921 (void (*)(const br_block_ctrcbc_class **, const void *, size_t))
922 &br_aes_pwr8_ctrcbc_init,
/* CTR encryption + CBC-MAC. */
923 (void (*)(const br_block_ctrcbc_class *const *,
924 void *, void *, void *, size_t))
925 &br_aes_pwr8_ctrcbc_encrypt,
/* CBC-MAC + CTR decryption. */
926 (void (*)(const br_block_ctrcbc_class *const *,
927 void *, void *, void *, size_t))
928 &br_aes_pwr8_ctrcbc_decrypt,
/* CTR only. */
929 (void (*)(const br_block_ctrcbc_class *const *,
930 void *, void *, size_t))
931 &br_aes_pwr8_ctrcbc_ctr,
/* CBC-MAC only. */
932 (void (*)(const br_block_ctrcbc_class *const *,
933 void *, const void *, size_t))
934 &br_aes_pwr8_ctrcbc_mac
939 /* see bearssl_block.h */
940 const br_block_ctrcbc_class *
941 br_aes_pwr8_ctrcbc_get_vtable(void)