/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_AES_X86NI

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
}
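
/*
 * Illustrative usage (hypothetical caller, not part of this file):
 * obtain the vtable, then invoke the methods through the generic
 * br_block_ctrcbc_class interface.
 *
 *   const br_block_ctrcbc_class *vt;
 *   br_aes_x86ni_ctrcbc_keys kc;
 *
 *   vt = br_aes_x86ni_ctrcbc_get_vtable();
 *   if (vt != NULL) {
 *           vt->init(&kc.vtable, key, key_len);
 *           vt->encrypt(&kc.vtable, ctr, cbcmac, data, len);
 *   }
 *
 * A NULL return means AES-NI is not available at runtime and a
 * portable implementation should be used instead.
 */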

/* see bearssl_block.h */
void
br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
}
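
/*
 * br_aes_x86ni_keysched_enc() returns the AES round count: 10, 12 or
 * 14 for 16-, 24- and 32-byte keys, respectively. That count selects
 * the round-specific branches in the functions below.
 */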

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx0, ivx1, ivx2, ivx3;
	__m128i erev, zero, one, four, notthree;
	unsigned u;

	buf = data;
	num_rounds = ctx->num_rounds;
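	/*
	 * Load the (num_rounds + 1) round keys; each is 16 bytes, hence
	 * the 'u << 4' byte offset into the schedule.
	 */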
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);
	four = _mm_set_epi64x(0, 4);
	notthree = _mm_sub_epi64(zero, four);
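
	/*
	 * 'erev' is a byte-reversal mask for _mm_shuffle_epi8(), used
	 * to convert between the big-endian counter encoding and the
	 * little-endian lanes used for arithmetic. 'notthree' is
	 * 0 - 4 = ~3 in each 64-bit lane, i.e. a mask that clears the
	 * two low bits.
	 */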

	/*
	 * Decode the counter in big-endian and pre-increment the other
	 * three counters.
	 */
	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
	ivx1 = _mm_add_epi64(ivx0, one);
	ivx1 = _mm_sub_epi64(ivx1,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
	ivx2 = _mm_add_epi64(ivx1, one);
	ivx2 = _mm_sub_epi64(ivx2,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
	ivx3 = _mm_add_epi64(ivx2, one);
	ivx3 = _mm_sub_epi64(ivx3,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
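
	/*
	 * Carry propagation trick: after adding 1 to the low 64-bit
	 * lane, _mm_cmpeq_epi64(ivx, zero) yields all-ones (-1) in that
	 * lane exactly when it wrapped to zero; shifting the comparison
	 * result left by 8 bytes moves it to the high lane, and
	 * subtracting -1 adds the carry. The loop below keeps four
	 * independent counter blocks in flight so that the pipelined
	 * AESENC units stay busy.
	 */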
	while (len > 0) {
		__m128i x0, x1, x2, x3;

		/*
		 * Load counter values; we need to byteswap them because
		 * the specification mandates big-endian encoding.
		 */
		x0 = _mm_shuffle_epi8(ivx0, erev);
		x1 = _mm_shuffle_epi8(ivx1, erev);
		x2 = _mm_shuffle_epi8(ivx2, erev);
		x3 = _mm_shuffle_epi8(ivx3, erev);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x2 = _mm_xor_si128(x2, sk[0]);
		x3 = _mm_xor_si128(x3, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x2 = _mm_aesenc_si128(x2, sk[1]);
		x3 = _mm_aesenc_si128(x3, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x2 = _mm_aesenc_si128(x2, sk[2]);
		x3 = _mm_aesenc_si128(x3, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x2 = _mm_aesenc_si128(x2, sk[3]);
		x3 = _mm_aesenc_si128(x3, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x2 = _mm_aesenc_si128(x2, sk[4]);
		x3 = _mm_aesenc_si128(x3, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x2 = _mm_aesenc_si128(x2, sk[5]);
		x3 = _mm_aesenc_si128(x3, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x2 = _mm_aesenc_si128(x2, sk[6]);
		x3 = _mm_aesenc_si128(x3, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x2 = _mm_aesenc_si128(x2, sk[7]);
		x3 = _mm_aesenc_si128(x3, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x2 = _mm_aesenc_si128(x2, sk[8]);
		x3 = _mm_aesenc_si128(x3, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		x2 = _mm_aesenc_si128(x2, sk[9]);
		x3 = _mm_aesenc_si128(x3, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
			x2 = _mm_aesenclast_si128(x2, sk[10]);
			x3 = _mm_aesenclast_si128(x3, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
			x2 = _mm_aesenclast_si128(x2, sk[12]);
			x3 = _mm_aesenclast_si128(x3, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x2 = _mm_aesenc_si128(x2, sk[12]);
			x3 = _mm_aesenc_si128(x3, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x2 = _mm_aesenc_si128(x2, sk[13]);
			x3 = _mm_aesenc_si128(x3, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
			x2 = _mm_aesenclast_si128(x2, sk[14]);
			x3 = _mm_aesenclast_si128(x3, sk[14]);
		}
		if (len >= 64) {
			x0 = _mm_xor_si128(x0,
				_mm_loadu_si128((void *)(buf +  0)));
			x1 = _mm_xor_si128(x1,
				_mm_loadu_si128((void *)(buf + 16)));
			x2 = _mm_xor_si128(x2,
				_mm_loadu_si128((void *)(buf + 32)));
			x3 = _mm_xor_si128(x3,
				_mm_loadu_si128((void *)(buf + 48)));
			_mm_storeu_si128((void *)(buf +  0), x0);
			_mm_storeu_si128((void *)(buf + 16), x1);
			_mm_storeu_si128((void *)(buf + 32), x2);
			_mm_storeu_si128((void *)(buf + 48), x3);
			buf += 64;
			len -= 64;
		} else {
			unsigned char tmp[64];

			_mm_storeu_si128((void *)(tmp +  0), x0);
			_mm_storeu_si128((void *)(tmp + 16), x1);
			_mm_storeu_si128((void *)(tmp + 32), x2);
			_mm_storeu_si128((void *)(tmp + 48), x3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
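			/*
			 * Per the br_block_ctrcbc contract (see
			 * bearssl_block.h), data is processed in whole
			 * blocks, so a final partial chunk here is 16,
			 * 32 or 48 bytes; the switch picks the matching
			 * pre-incremented counter as the value to write
			 * back.
			 */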
			switch (len) {
			case 16:
				ivx0 = ivx1;
				break;
			case 32:
				ivx0 = ivx2;
				break;
			case 48:
				ivx0 = ivx3;
				break;
			}
			break;
		}

		/*
		 * Add 4 to each counter value. For carry propagation
		 * into the upper 64-bit words, we would need to compare
		 * the results with 4, but SSE2+ has only _signed_
		 * comparisons. Instead, we mask out the low two bits,
		 * and check whether the remaining bits are zero.
		 */
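		/*
		 * E.g. if the low lane held 0xFFFFFFFFFFFFFFFE, adding 4
		 * wraps it to 2; 2 & ~3 == 0, so the comparison with
		 * zero flags the wrap and the carry is added into the
		 * high lane. A non-wrapping result is always >= 4 and
		 * keeps at least one bit set above the low two.
		 */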
		ivx0 = _mm_add_epi64(ivx0, four);
		ivx1 = _mm_add_epi64(ivx1, four);
		ivx2 = _mm_add_epi64(ivx2, four);
		ivx3 = _mm_add_epi64(ivx3, four);
		ivx0 = _mm_sub_epi64(ivx0,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx0, notthree), zero), 8));
		ivx1 = _mm_sub_epi64(ivx1,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx1, notthree), zero), 8));
		ivx2 = _mm_sub_epi64(ivx2,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx2, notthree), zero), 8));
		ivx3 = _mm_sub_epi64(ivx3,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx3, notthree), zero), 8));
	}

	/*
	 * Write back new counter value. The loop took care to put the
	 * right counter value in ivx0.
	 */
	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *cbcmac, const void *data, size_t len)
{
	const unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15], ivx;
	unsigned u;

	buf = data;
	ivx = _mm_loadu_si128(cbcmac);
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}
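	/*
	 * Plain CBC-MAC: each 16-byte block is XORed into the running
	 * MAC value, which is then encrypted again. Blocks are chained,
	 * so unlike the CTR path there is no parallelism to exploit.
	 */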
	while (len > 0) {
		__m128i x;

		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
		x = _mm_xor_si128(x, sk[0]);
		x = _mm_aesenc_si128(x, sk[1]);
		x = _mm_aesenc_si128(x, sk[2]);
		x = _mm_aesenc_si128(x, sk[3]);
		x = _mm_aesenc_si128(x, sk[4]);
		x = _mm_aesenc_si128(x, sk[5]);
		x = _mm_aesenc_si128(x, sk[6]);
		x = _mm_aesenc_si128(x, sk[7]);
		x = _mm_aesenc_si128(x, sk[8]);
		x = _mm_aesenc_si128(x, sk[9]);
		if (num_rounds == 10) {
			x = _mm_aesenclast_si128(x, sk[10]);
		} else if (num_rounds == 12) {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenclast_si128(x, sk[12]);
		} else {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenc_si128(x, sk[12]);
			x = _mm_aesenc_si128(x, sk[13]);
			x = _mm_aesenclast_si128(x, sk[14]);
		}
		ivx = x;
		buf += 16;
		len -= 16;
	}
	_mm_storeu_si128(cbcmac, ivx);
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;
	int first_iter;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	first_iter = 1;
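
	/*
	 * The CBC-MAC runs one block late so that, on each iteration,
	 * the counter block and the previous CBC-MAC input can be
	 * encrypted in parallel on the AES-NI units. The ciphertext
	 * block produced by the CTR part is folded into the next
	 * CBC-MAC input; first_iter covers the missing "previous"
	 * block, and the tail code below the loop encrypts the final
	 * CBC-MAC input.
	 */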
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   plaintext block to encrypt
		 *   x0   counter (for CTR encryption)
		 *   x1   previous CBC-MAC input, still to be encrypted
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = cmx;

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}

		x0 = _mm_xor_si128(x0, dx);
		if (first_iter) {
			cmx = _mm_xor_si128(cmx, x0);
			first_iter = 0;
		} else {
			cmx = _mm_xor_si128(x1, x0);
		}
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));

		/*
		 * If this was the last iteration, then compute the
		 * extra block encryption to complete CBC-MAC.
		 */
		if (len == 0) {
			cmx = _mm_xor_si128(cmx, sk[0]);
			cmx = _mm_aesenc_si128(cmx, sk[1]);
			cmx = _mm_aesenc_si128(cmx, sk[2]);
			cmx = _mm_aesenc_si128(cmx, sk[3]);
			cmx = _mm_aesenc_si128(cmx, sk[4]);
			cmx = _mm_aesenc_si128(cmx, sk[5]);
			cmx = _mm_aesenc_si128(cmx, sk[6]);
			cmx = _mm_aesenc_si128(cmx, sk[7]);
			cmx = _mm_aesenc_si128(cmx, sk[8]);
			cmx = _mm_aesenc_si128(cmx, sk[9]);
			if (num_rounds == 10) {
				cmx = _mm_aesenclast_si128(cmx, sk[10]);
			} else if (num_rounds == 12) {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenclast_si128(cmx, sk[12]);
			} else {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenc_si128(cmx, sk[12]);
				cmx = _mm_aesenc_si128(cmx, sk[13]);
				cmx = _mm_aesenclast_si128(cmx, sk[14]);
			}
			break;
		}
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   encrypted block of data
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC
		 */
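
		/*
		 * Unlike encryption, no one-block lag is needed here:
		 * the CBC-MAC input (ciphertext XOR previous MAC value)
		 * is known before the block is decrypted, so the MAC
		 * block and the counter block are encrypted in parallel
		 * directly.
		 */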
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = _mm_xor_si128(cmx, dx);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}
		x0 = _mm_xor_si128(x0, dx);
		cmx = x1;
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}

BR_TARGETS_X86_DOWN

/* see bearssl_block.h */
const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
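	/*
	 * Fields (see bearssl_block.h): context size, block size in
	 * bytes, log2 of the block size, then the init, encrypt,
	 * decrypt, ctr and mac methods, cast to the generic
	 * br_block_ctrcbc_class function-pointer types.
	 */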
	sizeof(br_aes_x86ni_ctrcbc_keys),
	16,
	4,
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_x86ni_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_x86ni_ctrcbc_mac
};

#else

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return NULL;
}

#endif