/*-
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

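/*
 * Eight AES blocks handled as a unit.  The eight-wide helpers below keep
 * several independent AESENC/AESDEC chains in flight at once, which hides
 * instruction latency on pipelined AES-NI implementations.
 */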
struct blocks8 {
        __m128i blk[8];
} __packed;

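/*
 * CBC encryption is inherently serial: each ciphertext block becomes the
 * IV for the next block, so encryption proceeds one block at a time.
 */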
void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
        __m128i tot, ivreg;
        size_t i;

        len /= AES_BLOCK_LEN;
        ivreg = _mm_loadu_si128((const __m128i *)iv);
        for (i = 0; i < len; i++) {
                tot = aesni_enc(rounds - 1, key_schedule,
                    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
                ivreg = tot;
                _mm_storeu_si128((__m128i *)to, tot);
                from += AES_BLOCK_LEN;
                to += AES_BLOCK_LEN;
        }
}
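
/*
 * Usage sketch (illustrative only, not compiled here): encrypting a buffer
 * in place with AES-128-CBC.  "key", "buf" and "buflen" are caller-supplied;
 * the key schedule must be expanded first and the caller must hold FPU
 * context (the aesni(4) driver arranges this with fpu_kern_enter()):
 *
 *	uint8_t sched[AES_SCHED_LEN] __aligned(16);
 *	uint8_t iv[AES_BLOCK_LEN] = { 0 };
 *
 *	aesni_set_enckey(key, sched, AES128_ROUNDS);
 *	aesni_encrypt_cbc(AES128_ROUNDS, sched, buflen, buf, buf, iv);
 */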
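/*
 * CBC decryption has no serial dependency between the AES operations (only
 * the final XOR uses the previous ciphertext block), so eight blocks are
 * decrypted in parallel and any tail is handled one block at a time.  The
 * routine works in place: each ciphertext block is saved as the next IV
 * before it is overwritten.
 */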
void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
        __m128i blocks[8];
        struct blocks8 *blks;
        __m128i ivreg, nextiv;
        size_t i, j, cnt;

        ivreg = _mm_loadu_si128((const __m128i *)iv);
        cnt = len / AES_BLOCK_LEN / 8;
        for (i = 0; i < cnt; i++) {
                blks = (struct blocks8 *)buf;
                aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
                    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
                    blks->blk[6], blks->blk[7], &blocks[0]);
                for (j = 0; j < 8; j++) {
                        nextiv = blks->blk[j];
                        blks->blk[j] = blocks[j] ^ ivreg;
                        ivreg = nextiv;
                }
                buf += AES_BLOCK_LEN * 8;
        }
        i *= 8;
        cnt = len / AES_BLOCK_LEN;
        for (; i < cnt; i++) {
                nextiv = _mm_loadu_si128((void *)buf);
                _mm_storeu_si128((void *)buf,
                    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
                ivreg = nextiv;
                buf += AES_BLOCK_LEN;
        }
}

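/*
 * ECB encrypts every block independently, so the bulk of the buffer goes
 * through the eight-wide path and the remainder through the scalar loop.
 */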
void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
        __m128i tot;
        __m128i tout[8];
        struct blocks8 *top;
        const struct blocks8 *blks;
        size_t i, cnt;

        cnt = len / AES_BLOCK_LEN / 8;
        for (i = 0; i < cnt; i++) {
                blks = (const struct blocks8 *)from;
                top = (struct blocks8 *)to;
                aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
                    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
                    blks->blk[6], blks->blk[7], tout);
                top->blk[0] = tout[0];
                top->blk[1] = tout[1];
                top->blk[2] = tout[2];
                top->blk[3] = tout[3];
                top->blk[4] = tout[4];
                top->blk[5] = tout[5];
                top->blk[6] = tout[6];
                top->blk[7] = tout[7];
                from += AES_BLOCK_LEN * 8;
                to += AES_BLOCK_LEN * 8;
        }
        i *= 8;
        cnt = len / AES_BLOCK_LEN;
        for (; i < cnt; i++) {
                tot = aesni_enc(rounds - 1, key_schedule,
                    _mm_loadu_si128((const __m128i *)from));
                _mm_storeu_si128((__m128i *)to, tot);
                from += AES_BLOCK_LEN;
                to += AES_BLOCK_LEN;
        }
}

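/* Mirror image of aesni_encrypt_ecb(), using the decryption schedule. */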
void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
        __m128i tot;
        __m128i tout[8];
        const struct blocks8 *blks;
        struct blocks8 *top;
        size_t i, cnt;

        cnt = len / AES_BLOCK_LEN / 8;
        for (i = 0; i < cnt; i++) {
                blks = (const struct blocks8 *)from;
                top = (struct blocks8 *)to;
                aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
                    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
                    blks->blk[6], blks->blk[7], tout);
                top->blk[0] = tout[0];
                top->blk[1] = tout[1];
                top->blk[2] = tout[2];
                top->blk[3] = tout[3];
                top->blk[4] = tout[4];
                top->blk[5] = tout[5];
                top->blk[6] = tout[6];
                top->blk[7] = tout[7];
                from += AES_BLOCK_LEN * 8;
                to += AES_BLOCK_LEN * 8;
        }
        i *= 8;
        cnt = len / AES_BLOCK_LEN;
        for (; i < cnt; i++) {
                tot = aesni_dec(rounds - 1, key_schedule,
                    _mm_loadu_si128((const __m128i *)from));
                _mm_storeu_si128((__m128i *)to, tot);
                from += AES_BLOCK_LEN;
                to += AES_BLOCK_LEN;
        }
}

/*
 * Mixed-endian increment: the counter's low 64 bits live in the high
 * quadword so that the layout stays compatible with the BSWAP_EPI64
 * shuffle used by aesni_encrypt_icm().  The 128-bit carry is propagated
 * by comparing the incremented quadword against zero and subtracting the
 * resulting all-ones mask from the other quadword.
 */
static inline __m128i
nextc(__m128i x)
{
        const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
        const __m128i ZERO = _mm_setzero_si128();

        x = _mm_add_epi64(x, ONE);
        __m128i t = _mm_cmpeq_epi64(x, ZERO);
        t = _mm_unpackhi_epi64(t, ZERO);
        x = _mm_sub_epi64(x, t);

        return x;
}

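/*
 * ICM/CTR mode: encrypt an incrementing counter and XOR the keystream with
 * the payload.  The counter is kept pre-swapped (see nextc()) and converted
 * to big-endian with BSWAP_EPI64 just before each encryption.  Eight counter
 * blocks are generated per iteration of the bulk loop; a trailing partial
 * block is bounced through a stack buffer so that no full 16-byte load or
 * store is ever issued past the end of the payload.
 */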
void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
        __m128i tot;
        __m128i tmp1, tmp2, tmp3, tmp4;
        __m128i tmp5, tmp6, tmp7, tmp8;
        __m128i ctr1, ctr2, ctr3, ctr4;
        __m128i ctr5, ctr6, ctr7, ctr8;
        __m128i BSWAP_EPI64;
        __m128i tout[8];
        __m128i block;
        struct blocks8 *top;
        const struct blocks8 *blks;
        size_t i, cnt, resid;

        BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

        ctr1 = _mm_loadu_si128((const __m128i *)iv);
        ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

        cnt = len / AES_BLOCK_LEN / 8;
        for (i = 0; i < cnt; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr2 = nextc(ctr1);
                tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
                ctr3 = nextc(ctr2);
                tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
                ctr4 = nextc(ctr3);
                tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
                ctr5 = nextc(ctr4);
                tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
                ctr6 = nextc(ctr5);
                tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
                ctr7 = nextc(ctr6);
                tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
                ctr8 = nextc(ctr7);
                tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
                ctr1 = nextc(ctr8);

                blks = (const struct blocks8 *)from;
                top = (struct blocks8 *)to;
                aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
                    tmp5, tmp6, tmp7, tmp8, tout);

                top->blk[0] = blks->blk[0] ^ tout[0];
                top->blk[1] = blks->blk[1] ^ tout[1];
                top->blk[2] = blks->blk[2] ^ tout[2];
                top->blk[3] = blks->blk[3] ^ tout[3];
                top->blk[4] = blks->blk[4] ^ tout[4];
                top->blk[5] = blks->blk[5] ^ tout[5];
                top->blk[6] = blks->blk[6] ^ tout[6];
                top->blk[7] = blks->blk[7] ^ tout[7];

                from += AES_BLOCK_LEN * 8;
                to += AES_BLOCK_LEN * 8;
        }
        i *= 8;
        cnt = len / AES_BLOCK_LEN;
        for (; i < cnt; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr1 = nextc(ctr1);

                tot = aesni_enc(rounds - 1, key_schedule, tmp1);

                tot = tot ^ _mm_loadu_si128((const __m128i *)from);
                _mm_storeu_si128((__m128i *)to, tot);

                from += AES_BLOCK_LEN;
                to += AES_BLOCK_LEN;
        }

        /*
         * Handle a remaining partial block.  Copy the remaining payload onto
         * the stack so that a full block can be loaded and stored safely.
         */
        resid = len % AES_BLOCK_LEN;
        if (resid != 0) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tot = aesni_enc(rounds - 1, key_schedule, tmp1);
                block = _mm_setzero_si128();
                memcpy(&block, from, resid);
                tot = tot ^ _mm_loadu_si128(&block);
                memcpy(to, &tot, resid);
                explicit_bzero(&block, sizeof(block));
        }
}

#define AES_XTS_BLOCKSIZE       16
#define AES_XTS_IVSIZE          8
#define AES_XTS_ALPHA           0x87    /* low byte of the GF(2^128) reduction polynomial */

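/*
 * Multiply the tweak by alpha (i.e. by x) in GF(2^128) using 32-bit SSE
 * operations.  The word shuffle (0x93) rotates the four 32-bit words left
 * by one position and the arithmetic shift smears each word's sign bit
 * across its lane; after masking with alphamask, each lane holds either
 * the carry bit shifted out of the word below it (the 1s) or, for the
 * lowest word, the 0x87 reduction constant when bit 127 was set.
 */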
static inline __m128i
xts_crank_lfsr(__m128i inp)
{
        const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
        __m128i xtweak, ret;

        /* set up xor mask */
        xtweak = _mm_shuffle_epi32(inp, 0x93);
        xtweak = _mm_srai_epi32(xtweak, 31);
        xtweak &= alphamask;

        /* next term */
        ret = _mm_slli_epi32(inp, 1);
        ret ^= xtweak;

        return ret;
}

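/*
 * One XTS block: C = E_k(P ^ T) ^ T (or the decryption analogue), after
 * which the tweak T is advanced to alpha * T for the next block.
 */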
static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
        __m128i block;

        block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

        if (do_encrypt)
                block = aesni_enc(rounds - 1, key_schedule, block);
        else
                block = aesni_dec(rounds - 1, key_schedule, block);

        _mm_storeu_si128((__m128i *)to, block ^ *tweak);

        *tweak = xts_crank_lfsr(*tweak);
}

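/*
 * Eight XTS blocks per call: the tweaks are necessarily generated serially,
 * but the AES rounds for all eight blocks run in parallel through
 * aesni_enc8()/aesni_dec8().
 */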
static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
        __m128i tmptweak;
        __m128i a, b, c, d, e, f, g, h;
        __m128i tweaks[8];
        __m128i tmp[8];
        __m128i *top;
        const __m128i *fromp;

        tmptweak = *tweak;

        /*
         * Unroll the loop.  This lets the compiler keep the values in
         * registers and saves memory accesses.
         */
        fromp = (const __m128i *)from;
#define PREPINP(v, pos)                                         \
                do {                                            \
                        tweaks[(pos)] = tmptweak;               \
                        (v) = _mm_loadu_si128(&fromp[pos]) ^    \
                            tmptweak;                           \
                        tmptweak = xts_crank_lfsr(tmptweak);    \
                } while (0)
        PREPINP(a, 0);
        PREPINP(b, 1);
        PREPINP(c, 2);
        PREPINP(d, 3);
        PREPINP(e, 4);
        PREPINP(f, 5);
        PREPINP(g, 6);
        PREPINP(h, 7);
        *tweak = tmptweak;

        if (do_encrypt)
                aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
                    tmp);
        else
                aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
                    tmp);

        top = (__m128i *)to;
        _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
        _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
        _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
        _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
        _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
        _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
        _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
        _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

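/*
 * XTS bulk routine: derive the initial tweak by encrypting the IV (a 64-bit
 * little-endian block number, zero-extended to a full block) under the
 * tweak key, then run the eight-wide and scalar block loops.
 */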
static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
        __m128i tweakreg;
        uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
        size_t i, cnt;

        /*
         * Prepare tweak as E_k2(IV). IV is specified as LE representation
         * of a 64-bit block number which we allow to be passed in directly.
         */
#if BYTE_ORDER == LITTLE_ENDIAN
        bcopy(iv, tweak, AES_XTS_IVSIZE);
        /* Last 64 bits of IV are always zero. */
        bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
        tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
        tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

        cnt = len / AES_XTS_BLOCKSIZE / 8;
        for (i = 0; i < cnt; i++) {
                aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
                    from, to, do_encrypt);
                from += AES_XTS_BLOCKSIZE * 8;
                to += AES_XTS_BLOCKSIZE * 8;
        }
        i *= 8;
        cnt = len / AES_XTS_BLOCKSIZE;
        for (; i < cnt; i++) {
                aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
                    from, to, do_encrypt);
                from += AES_XTS_BLOCKSIZE;
                to += AES_XTS_BLOCKSIZE;
        }
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

        aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
            iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

        aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
            iv, 0);
}

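/*
 * Expand the key schedules for a session.  Counter-style modes (ICM, GCM,
 * CCM) only ever run the AES encryption primitive, so the decryption
 * schedule is skipped for them.  XTS supplies two keys back to back, hence
 * the halved key length and the second (tweak) schedule.
 */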
void
aesni_cipher_setup_common(struct aesni_session *ses,
    const struct crypto_session_params *csp, const uint8_t *key, int keylen)
{
        int decsched;

        decsched = 1;

        switch (csp->csp_cipher_alg) {
        case CRYPTO_AES_ICM:
        case CRYPTO_AES_NIST_GCM_16:
        case CRYPTO_AES_CCM_16:
                decsched = 0;
                break;
        }

        if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
                keylen /= 2;

        switch (keylen * 8) {
        case 128:
                ses->rounds = AES128_ROUNDS;
                break;
        case 192:
                ses->rounds = AES192_ROUNDS;
                break;
        case 256:
                ses->rounds = AES256_ROUNDS;
                break;
        default:
                panic("shouldn't happen");
        }

        aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
        if (decsched)
                aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
                    ses->rounds);

        if (csp->csp_cipher_alg == CRYPTO_AES_XTS)
                aesni_set_enckey(key + keylen, ses->xts_schedule,
                    ses->rounds);
}