2 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
3 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
4 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
5 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
6 * Copyright (c) 2014 The FreeBSD Foundation
9 * Portions of this software were developed by John-Mark Gurney
10 * under sponsorship of the FreeBSD Foundation and
11 * Rubicon Communications, LLC (Netgate).
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
38 #include <sys/param.h>
39 #include <sys/libkern.h>
40 #include <sys/malloc.h>
42 #include <sys/systm.h>
43 #include <crypto/aesni/aesni.h>
45 #include <opencrypto/gmac.h>
47 #include "aesencdec.h"
48 #include <smmintrin.h>
50 MALLOC_DECLARE(M_AESNI);
/*
 * CBC-mode encryption of `from` into `to` using the AES-NI helper
 * aesni_enc().
 *
 * rounds       - round count for the key schedule; aesni_enc() is called
 *                with rounds - 1.
 * key_schedule - expanded encryption key, passed through to aesni_enc().
 * len          - used directly as the loop bound below.
 * from / to    - input and output buffers, accessed with unaligned
 *                128-bit loads and stores.
 * iv           - initial chaining value, at least AES_BLOCK_LEN bytes.
 *
 * NOTE(review): several lines of this function are not visible here (return
 * type, local declarations, braces, and any statements that update `ivreg`,
 * `to`, or rescale `len`).  Because the loop counts up to `len` while `from`
 * advances by AES_BLOCK_LEN per iteration, `len` is presumably converted
 * from bytes to blocks before the loop -- confirm against the complete file.
 */
57 aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
58 const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
/* Start the chain from the caller-supplied IV (unaligned load). */
64 ivreg = _mm_loadu_si128((const __m128i *)iv);
65 for (i = 0; i < len; i++) {
/* XOR the current input block with the chaining value, then encrypt. */
66 tot = aesni_enc(rounds - 1, key_schedule,
67 _mm_loadu_si128((const __m128i *)from) ^ ivreg);
/* Emit the encrypted block; the store tolerates an unaligned `to`. */
69 _mm_storeu_si128((__m128i *)to, tot);
70 from += AES_BLOCK_LEN;
/*
 * CBC-mode decryption performed in place on `buf`.
 *
 * The buffer is processed in two phases: first in batches of eight blocks
 * through aesni_dec8(), then block by block through aesni_dec() for
 * whatever whole blocks remain.  In both phases the original ciphertext
 * block is saved in `nextiv` before the buffer is overwritten, because it is
 * needed as the chaining value for the following block.
 *
 * rounds       - round count; the helpers are called with rounds - 1.
 * key_schedule - expanded decryption key, passed through to the helpers.
 * len          - buffer length in bytes (divided by AES_BLOCK_LEN below).
 * buf          - ciphertext on entry, overwritten with the result.
 * iv           - initial chaining value, at least AES_BLOCK_LEN bytes.
 *
 * NOTE(review): lines are missing from this view (return type, braces, the
 * declarations of `blocks`, `blks`, `i`, `j`, `cnt`, and the statements that
 * hand `nextiv` over to `ivreg`).  The second loop resumes from `i` without
 * re-initialising it, so `i` is presumably rescaled from batches to blocks
 * on a line not shown -- confirm against the complete file.
 */
76 aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
77 uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
81 __m128i ivreg, nextiv;
84 ivreg = _mm_loadu_si128((const __m128i *)iv);
/* Number of complete eight-block batches. */
85 cnt = len / AES_BLOCK_LEN / 8;
86 for (i = 0; i < cnt; i++) {
87 blks = (struct blocks8 *)buf;
/* Decrypt eight blocks at once; results land in the local blocks[]. */
88 aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
89 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
90 blks->blk[6], blks->blk[7], &blocks[0]);
91 for (j = 0; j < 8; j++) {
/* Keep the ciphertext block before it is overwritten in place. */
92 nextiv = blks->blk[j];
/* Undo the CBC chaining: decrypted block XOR chaining value. */
93 blks->blk[j] = blocks[j] ^ ivreg;
96 buf += AES_BLOCK_LEN * 8;
/* Total number of whole blocks; the loop below handles the remainder. */
99 cnt = len / AES_BLOCK_LEN;
100 for (; i < cnt; i++) {
101 nextiv = _mm_loadu_si128((void *)buf);
102 _mm_storeu_si128((void *)buf,
103 aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
105 buf += AES_BLOCK_LEN;
/*
 * ECB-mode encryption of `from` into `to`.
 *
 * In the visible code every block is passed to aesni_enc8() (eight at a
 * time) or aesni_enc() (one at a time) and stored without being combined
 * with any other value.
 *
 * rounds       - round count; the helpers are called with rounds - 1.
 * key_schedule - expanded encryption key, passed through to the helpers.
 * len          - input length in bytes (divided by AES_BLOCK_LEN below).
 * from / to    - input and output buffers.
 *
 * NOTE(review): lines are missing from this view (return type, braces,
 * declarations of `tot`, `tout`, `top`, `i`, `cnt`, and at least the
 * advance of `to` in the tail loop).  The tail loop resumes from `i`, so `i`
 * is presumably rescaled from batches to blocks on a line not shown --
 * confirm against the complete file.
 */
110 aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
111 const uint8_t *from, uint8_t *to)
116 const struct blocks8 *blks;
/* Number of complete eight-block batches. */
119 cnt = len / AES_BLOCK_LEN / 8;
120 for (i = 0; i < cnt; i++) {
/* View the next eight input/output blocks through struct blocks8. */
121 blks = (const struct blocks8 *)from;
122 top = (struct blocks8 *)to;
123 aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
124 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
125 blks->blk[6], blks->blk[7], tout);
/* Copy the eight results from the local array to the output. */
126 top->blk[0] = tout[0];
127 top->blk[1] = tout[1];
128 top->blk[2] = tout[2];
129 top->blk[3] = tout[3];
130 top->blk[4] = tout[4];
131 top->blk[5] = tout[5];
132 top->blk[6] = tout[6];
133 top->blk[7] = tout[7];
134 from += AES_BLOCK_LEN * 8;
135 to += AES_BLOCK_LEN * 8;
/* Total number of whole blocks; finish the remainder one at a time. */
138 cnt = len / AES_BLOCK_LEN;
139 for (; i < cnt; i++) {
140 tot = aesni_enc(rounds - 1, key_schedule,
141 _mm_loadu_si128((const __m128i *)from));
142 _mm_storeu_si128((__m128i *)to, tot);
143 from += AES_BLOCK_LEN;
/*
 * ECB-mode decryption of `from` into `to`; same shape as the encryption
 * routine but built on aesni_dec8() / aesni_dec().
 *
 * rounds       - round count; the helpers are called with rounds - 1.
 * key_schedule - expanded decryption key, passed through to the helpers.
 * len          - input length in bytes (divided by AES_BLOCK_LEN below).
 * from / to    - input and output buffers.  They are declared with an
 *                array size of AES_BLOCK_LEN, but array parameters are
 *                plain pointers in C and the code below walks them across
 *                the whole `len`, so that size is not a limit.
 *
 * NOTE(review): lines are missing from this view (return type, braces,
 * declarations of `tot`, `tout`, `top`, `i`, `cnt`, and at least the
 * advance of `to` in the tail loop).  The tail loop resumes from `i`, so `i`
 * is presumably rescaled from batches to blocks on a line not shown --
 * confirm against the complete file.
 */
149 aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
150 const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
154 const struct blocks8 *blks;
/* Number of complete eight-block batches. */
158 cnt = len / AES_BLOCK_LEN / 8;
159 for (i = 0; i < cnt; i++) {
/* View the next eight input/output blocks through struct blocks8. */
160 blks = (const struct blocks8 *)from;
161 top = (struct blocks8 *)to;
162 aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
163 blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
164 blks->blk[6], blks->blk[7], tout);
/* Copy the eight results from the local array to the output. */
165 top->blk[0] = tout[0];
166 top->blk[1] = tout[1];
167 top->blk[2] = tout[2];
168 top->blk[3] = tout[3];
169 top->blk[4] = tout[4];
170 top->blk[5] = tout[5];
171 top->blk[6] = tout[6];
172 top->blk[7] = tout[7];
173 from += AES_BLOCK_LEN * 8;
174 to += AES_BLOCK_LEN * 8;
/* Total number of whole blocks; finish the remainder one at a time. */
177 cnt = len / AES_BLOCK_LEN;
178 for (; i < cnt; i++) {
179 tot = aesni_dec(rounds - 1, key_schedule,
180 _mm_loadu_si128((const __m128i *)from));
181 _mm_storeu_si128((__m128i *)to, tot);
182 from += AES_BLOCK_LEN;
188 * mixed-endian increment: the low 64 bits are stored in the high word to be compatible
/*
 * Increment a 128-bit value held as two 64-bit lanes, where the UPPER lane
 * is the least-significant half and a wrap of that lane carries into the
 * LOWER lane.
 *
 * NOTE(review): the line carrying the function name and parameter list, the
 * braces and the return statement are not visible in this view.
 */
191 static inline __m128i
/* _mm_setr_epi32 lists lanes low to high, so this is 1 in the upper 64-bit lane. */
194 const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
195 const __m128i ZERO = _mm_setzero_si128();
/* Add one to the upper 64-bit lane only. */
197 x = _mm_add_epi64(x, ONE);
/* Each 64-bit lane of t becomes all-ones where the matching lane of x is zero. */
198 __m128i t = _mm_cmpeq_epi64(x, ZERO);
/* Move the upper-lane result into the lower lane and clear the upper lane. */
199 t = _mm_unpackhi_epi64(t, ZERO);
/* Subtracting all-ones (-1) adds one to the lower lane when the upper lane wrapped. */
200 x = _mm_sub_epi64(x, t);
/*
 * Counter-style encryption: counter blocks derived from `iv` are encrypted
 * with aesni_enc8() / aesni_enc() and the result is XORed with the input.
 *
 * Three phases are visible: batches of eight blocks, then single whole
 * blocks, then a final partial block of len % AES_BLOCK_LEN bytes.
 *
 * rounds       - round count; the helpers are called with rounds - 1.
 * key_schedule - expanded encryption key, passed through to the helpers.
 * len          - length in bytes; does not have to be a multiple of
 *                AES_BLOCK_LEN (see the partial-block handling at the end).
 * from / to    - input and output buffers.
 * iv           - initial counter block, at least AES_BLOCK_LEN bytes.
 *
 * NOTE(review): lines are missing from this view, including the return
 * type, braces, some declarations, every statement that assigns ctr2..ctr8
 * or advances ctr1, the advance of `to` in the single-block loop, and the
 * condition guarding the partial-block code.  The single-block loop resumes
 * from `i`, so `i` is presumably rescaled from batches to blocks on a line
 * not shown -- confirm against the complete file.
 */
206 aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
207 const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
210 __m128i tmp1, tmp2, tmp3, tmp4;
211 __m128i tmp5, tmp6, tmp7, tmp8;
212 __m128i ctr1, ctr2, ctr3, ctr4;
213 __m128i ctr5, ctr6, ctr7, ctr8;
218 const struct blocks8 *blks;
219 size_t i, cnt, resid;
/*
 * Shuffle mask that reverses the byte order inside each 64-bit half
 * (_mm_set_epi8 lists bytes from position 15 down to position 0).
 */
221 BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
/* Load the IV and byte-swap each half to get the working counter form. */
223 ctr1 = _mm_loadu_si128((const __m128i *)iv);
224 ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
/* Number of complete eight-block batches. */
226 cnt = len / AES_BLOCK_LEN / 8;
227 for (i = 0; i < cnt; i++) {
/*
 * Swap each counter back before it is encrypted.  The assignments to
 * ctr2..ctr8 sit on lines not visible here.
 */
228 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
230 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
232 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
234 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
236 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
238 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
240 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
242 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
245 blks = (const struct blocks8 *)from;
246 top = (struct blocks8 *)to;
/* Encrypt the eight counter blocks; tout[] receives the results. */
247 aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
248 tmp5, tmp6, tmp7, tmp8, tout);
/* Output = input XOR encrypted counter, block by block. */
250 top->blk[0] = blks->blk[0] ^ tout[0];
251 top->blk[1] = blks->blk[1] ^ tout[1];
252 top->blk[2] = blks->blk[2] ^ tout[2];
253 top->blk[3] = blks->blk[3] ^ tout[3];
254 top->blk[4] = blks->blk[4] ^ tout[4];
255 top->blk[5] = blks->blk[5] ^ tout[5];
256 top->blk[6] = blks->blk[6] ^ tout[6];
257 top->blk[7] = blks->blk[7] ^ tout[7];
259 from += AES_BLOCK_LEN * 8;
260 to += AES_BLOCK_LEN * 8;
/* Total number of whole blocks; finish the remainder one at a time. */
263 cnt = len / AES_BLOCK_LEN;
264 for (; i < cnt; i++) {
265 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
268 tot = aesni_enc(rounds - 1, key_schedule, tmp1);
270 tot = tot ^ _mm_loadu_si128((const __m128i *)from);
271 _mm_storeu_si128((__m128i *)to, tot);
273 from += AES_BLOCK_LEN;
278 * Handle remaining partial round. Copy the remaining payload onto the
279 * stack to ensure that the full block can be loaded safely.
281 resid = len % AES_BLOCK_LEN;
283 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
284 tot = aesni_enc(rounds - 1, key_schedule, tmp1);
/* Zero-pad a local block, then copy in only the resid trailing input bytes. */
285 block = _mm_setzero_si128();
286 memcpy(&block, from, resid);
287 tot = tot ^ _mm_loadu_si128(&block);
/* Write back only resid bytes so the output is not overrun. */
288 memcpy(to, &tot, resid);
/* Clear the local copy of the input bytes. */
289 explicit_bzero(&block, sizeof(block));
/*
 * XTS constants used by the routines in this file:
 *   AES_XTS_BLOCKSIZE - size of the tweak buffer and the per-block stride.
 *   AES_XTS_IVSIZE    - number of IV bytes copied into the tweak buffer;
 *                       the rest of the buffer is zeroed.
 *   AES_XTS_ALPHA     - value placed in the lowest lane of the mask used
 *                       when stepping the tweak.
 */
293 #define AES_XTS_BLOCKSIZE 16
294 #define AES_XTS_IVSIZE 8
295 #define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */
/*
 * Step the XTS tweak `inp` to its next value.
 *
 * Visible steps: build a per-lane mask from the top bit of each 32-bit
 * lane (rotated up by one lane), and shift every 32-bit lane left by one.
 *
 * NOTE(review): the declarations of `xtweak` and `ret`, the statements that
 * combine the mask with `alphamask` and with the shifted value, and the
 * return are on lines not visible here.  Presumably the mask carries each
 * lane's shifted-out bit into the next lane and folds the topmost bit back
 * in via AES_XTS_ALPHA -- confirm against the complete file.
 */
297 static inline __m128i
298 xts_crank_lfsr(__m128i inp)
/* _mm_set_epi32 lists lanes high to low: lane 0 holds AES_XTS_ALPHA, lanes 1-3 hold 1. */
300 const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
303 /* set up xor mask */
/* 0x93 rotates the lanes: lane 0 <- lane 3, lane 1 <- lane 0, lane 2 <- lane 1, lane 3 <- lane 2. */
304 xtweak = _mm_shuffle_epi32(inp, 0x93);
/* Arithmetic shift by 31 turns each lane into all-ones or zero from its top bit. */
305 xtweak = _mm_srai_epi32(xtweak, 31);
/* Shift each 32-bit lane left by one; bits do not cross lanes in this step. */
309 ret = _mm_slli_epi32(inp, 1);
/*
 * Process one XTS block: XOR the input with the tweak, run it through
 * aesni_enc() or aesni_dec(), XOR with the same tweak again, store it, and
 * advance the tweak for the next block.
 *
 * rounds       - round count; the helpers are called with rounds - 1.
 * key_schedule - expanded data key, passed through to the helpers.
 * tweak        - in/out: current tweak, replaced by the next one on return.
 * from / to    - input and output block (unaligned access is used).
 * do_encrypt   - selects between the encrypt and decrypt helper.
 *
 * NOTE(review): the return type, braces, the declaration of `block` and the
 * if/else lines that test `do_encrypt` are not visible in this view.
 */
316 aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
317 const uint8_t *from, uint8_t *to, int do_encrypt)
/* Pre-whiten the input with the tweak. */
321 block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;
324 block = aesni_enc(rounds - 1, key_schedule, block);
326 block = aesni_dec(rounds - 1, key_schedule, block);
/* Post-whiten with the same tweak and store the result. */
328 _mm_storeu_si128((__m128i *)to, block ^ *tweak);
/* Step the tweak so the caller can process the following block. */
330 *tweak = xts_crank_lfsr(*tweak);
/*
 * Process eight consecutive XTS blocks in one call.
 *
 * For each of the eight positions the PREPINP macro records the current
 * tweak in tweaks[], XORs it into the loaded input block, and steps the
 * tweak.  The eight whitened blocks then go through aesni_enc8() or
 * aesni_dec8(), and each result is XORed with its recorded tweak on the way
 * to the output.
 *
 * Parameters match the single-block variant; `from` and `to` must each
 * cover eight blocks.
 *
 * NOTE(review): lines are missing from this view, including the return
 * type, braces, the declarations of `tmptweak`, `tweaks`, `tmp`, `top`, the
 * PREPINP invocations and the rest of the macro text, the write-back of the
 * advanced tweak to *tweak, the if/else on `do_encrypt`, the final argument
 * of the enc8/dec8 calls, and the assignment of `top`.
 */
334 aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
335 const uint8_t *from, uint8_t *to, int do_encrypt)
338 __m128i a, b, c, d, e, f, g, h;
342 const __m128i *fromp;
347 * unroll the loop. This lets gcc put values directly in the
348 * register and saves memory accesses.
350 fromp = (const __m128i *)from;
/* PREPINP(v, pos): save the tweak for slot pos, whiten input block pos into v, step the tweak. */
351 #define PREPINP(v, pos) \
353 tweaks[(pos)] = tmptweak; \
354 (v) = _mm_loadu_si128(&fromp[pos]) ^ \
356 tmptweak = xts_crank_lfsr(tmptweak); \
369 aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
372 aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
/* Post-whiten each result with the tweak recorded for its slot. */
376 _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
377 _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
378 _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
379 _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
380 _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
381 _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
382 _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
383 _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
/*
 * Common XTS driver for both directions.
 *
 * Builds the initial tweak by encrypting the zero-extended IV with
 * `tweak_schedule`, then walks the data in eight-block batches followed by
 * single blocks, using `data_schedule`.
 *
 * rounds         - round count (the tweak encryption uses rounds - 1; the
 *                  block helpers receive `rounds` unchanged).
 * data_schedule  - expanded key handed to the block helpers.
 * tweak_schedule - expanded key used only to encrypt the IV.
 * len            - length in bytes; only whole AES_XTS_BLOCKSIZE blocks are
 *                  processed by the visible loops.
 * from / to      - input and output buffers.
 * iv             - only the first AES_XTS_IVSIZE bytes are read.
 * do_encrypt     - forwarded to the block helpers.
 *
 * NOTE(review): lines are missing from this view (return type, braces,
 * declarations of `tweakreg`, `i`, `cnt`, and the #else / #endif that pair
 * with the byte-order check).  The single-block loop resumes from `i`, so
 * `i` is presumably rescaled from batches to blocks on a line not shown --
 * confirm against the complete file.
 */
387 aesni_crypt_xts(int rounds, const __m128i *data_schedule,
388 const __m128i *tweak_schedule, size_t len, const uint8_t *from,
389 uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
392 uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
396 * Prepare tweak as E_k2(IV). IV is specified as LE representation
397 * of a 64-bit block number which we allow to be passed in directly.
399 #if BYTE_ORDER == LITTLE_ENDIAN
400 bcopy(iv, tweak, AES_XTS_IVSIZE);
401 /* Last 64 bits of IV are always zero. */
402 bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
404 #error Only LITTLE_ENDIAN architectures are supported.
/* Load the padded IV and encrypt it with the tweak key to get the first tweak. */
406 tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
407 tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);
/* Number of complete eight-block batches. */
409 cnt = len / AES_XTS_BLOCKSIZE / 8;
410 for (i = 0; i < cnt; i++) {
/* The helper advances tweakreg past the eight blocks it handles. */
411 aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
412 from, to, do_encrypt);
413 from += AES_XTS_BLOCKSIZE * 8;
414 to += AES_XTS_BLOCKSIZE * 8;
/* Total number of whole blocks; finish the remainder one at a time. */
417 cnt = len / AES_XTS_BLOCKSIZE;
418 for (; i < cnt; i++) {
419 aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
420 from, to, do_encrypt);
421 from += AES_XTS_BLOCKSIZE;
422 to += AES_XTS_BLOCKSIZE;
/*
 * XTS encryption entry point: forwards all of its arguments to
 * aesni_crypt_xts().
 *
 * NOTE(review): the return type, braces and the continuation line holding
 * the trailing arguments of the call are not visible in this view;
 * presumably they pass `iv` and select the encrypt direction -- confirm
 * against the complete file.
 */
427 aesni_encrypt_xts(int rounds, const void *data_schedule,
428 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
429 const uint8_t iv[static AES_BLOCK_LEN])
432 aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
/*
 * XTS decryption entry point: forwards all of its arguments to
 * aesni_crypt_xts().
 *
 * NOTE(review): the return type, braces and the continuation line holding
 * the trailing arguments of the call are not visible in this view;
 * presumably they pass `iv` and select the decrypt direction -- confirm
 * against the complete file.
 */
437 aesni_decrypt_xts(int rounds, const void *data_schedule,
438 const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
439 const uint8_t iv[static AES_BLOCK_LEN])
442 aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
447 aesni_cipher_setup_common(struct aesni_session *ses,
448 const struct crypto_session_params *csp, const uint8_t *key, int keylen)
454 switch (csp->csp_cipher_alg) {
456 case CRYPTO_AES_NIST_GCM_16:
457 case CRYPTO_AES_CCM_16:
462 if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
465 switch (keylen * 8) {
467 ses->rounds = AES128_ROUNDS;
470 ses->rounds = AES192_ROUNDS;
473 ses->rounds = AES256_ROUNDS;
476 panic("shouldn't happen");
479 aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
481 aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
484 if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
485 aesni_set_enckey(key + keylen, ses->xts_schedule,