2 * Copyright (c) 2014 The FreeBSD Foundation
5 * This software was developed by John-Mark Gurney under
6 * the sponsorship of the FreeBSD Foundation and
7 * Rubicon Communications, LLC (Netgate).
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * Figure 5, 8 and 12 are copied from the Intel white paper:
36 * Intel® Carry-Less Multiplication Instruction and its Usage for
37 * Computing the GCM Mode
40 * Copyright © 2010 Intel Corporation.
41 * All rights reserved.
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
46 * * Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * * Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * * Neither the name of Intel Corporation nor the
52 * names of its contributors may be used to endorse or promote products
53 * derived from this software without specific prior written permission.
55 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
56 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
57 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
58 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
59 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
60 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
61 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
62 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
63 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
65 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
69 #include <crypto/aesni/aesni.h>
70 #include <crypto/aesni/aesni_os.h>
75 #include <wmmintrin.h>
76 #include <emmintrin.h>
77 #include <smmintrin.h>
/*
 * Compare two 128-bit values for full equality.  Returns non-zero when
 * every byte of a equals the corresponding byte of b.
 */
static int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	/* movemask yields one bit per byte; all-equal -> all 16 bits set. */
	return _mm_movemask_epi8(cmp) == 0xffff;
}
#ifdef __i386__
/*
 * 32-bit compatibility shim: _mm_insert_epi64 is only available on amd64,
 * so emulate it with two 32-bit inserts.  Only ndx values 0 and 1 are
 * meaningful (low and high 64-bit lane respectively).
 */
static __inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return (a);
}
#endif
106 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
108 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
110 gfmul(__m128i a, __m128i b, __m128i *res)
112 __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
114 tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
115 tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
116 tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
117 tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
119 tmp4 = _mm_xor_si128(tmp4, tmp5);
120 tmp5 = _mm_slli_si128(tmp4, 8);
121 tmp4 = _mm_srli_si128(tmp4, 8);
122 tmp3 = _mm_xor_si128(tmp3, tmp5);
123 tmp6 = _mm_xor_si128(tmp6, tmp4);
125 tmp7 = _mm_srli_epi32(tmp3, 31);
126 tmp8 = _mm_srli_epi32(tmp6, 31);
127 tmp3 = _mm_slli_epi32(tmp3, 1);
128 tmp6 = _mm_slli_epi32(tmp6, 1);
130 tmp9 = _mm_srli_si128(tmp7, 12);
131 tmp8 = _mm_slli_si128(tmp8, 4);
132 tmp7 = _mm_slli_si128(tmp7, 4);
133 tmp3 = _mm_or_si128(tmp3, tmp7);
134 tmp6 = _mm_or_si128(tmp6, tmp8);
135 tmp6 = _mm_or_si128(tmp6, tmp9);
137 tmp7 = _mm_slli_epi32(tmp3, 31);
138 tmp8 = _mm_slli_epi32(tmp3, 30);
139 tmp9 = _mm_slli_epi32(tmp3, 25);
141 tmp7 = _mm_xor_si128(tmp7, tmp8);
142 tmp7 = _mm_xor_si128(tmp7, tmp9);
143 tmp8 = _mm_srli_si128(tmp7, 4);
144 tmp7 = _mm_slli_si128(tmp7, 12);
145 tmp3 = _mm_xor_si128(tmp3, tmp7);
147 tmp2 = _mm_srli_epi32(tmp3, 1);
148 tmp4 = _mm_srli_epi32(tmp3, 2);
149 tmp5 = _mm_srli_epi32(tmp3, 7);
150 tmp2 = _mm_xor_si128(tmp2, tmp4);
151 tmp2 = _mm_xor_si128(tmp2, tmp5);
152 tmp2 = _mm_xor_si128(tmp2, tmp8);
153 tmp3 = _mm_xor_si128(tmp3, tmp2);
154 tmp6 = _mm_xor_si128(tmp6, tmp3);
160 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
163 reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
164 __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
166 /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
167 __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
168 H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
169 __m128i tmp0, tmp1, tmp2, tmp3;
170 __m128i tmp4, tmp5, tmp6, tmp7;
173 H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
174 H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
175 H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
176 H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
178 lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
179 lo = _mm_xor_si128(lo, H3_X3_lo);
180 lo = _mm_xor_si128(lo, H4_X4_lo);
182 H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
183 H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
184 H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
185 H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
187 hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
188 hi = _mm_xor_si128(hi, H3_X3_hi);
189 hi = _mm_xor_si128(hi, H4_X4_hi);
191 tmp0 = _mm_shuffle_epi32(H1, 78);
192 tmp4 = _mm_shuffle_epi32(X1, 78);
193 tmp0 = _mm_xor_si128(tmp0, H1);
194 tmp4 = _mm_xor_si128(tmp4, X1);
195 tmp1 = _mm_shuffle_epi32(H2, 78);
196 tmp5 = _mm_shuffle_epi32(X2, 78);
197 tmp1 = _mm_xor_si128(tmp1, H2);
198 tmp5 = _mm_xor_si128(tmp5, X2);
199 tmp2 = _mm_shuffle_epi32(H3, 78);
200 tmp6 = _mm_shuffle_epi32(X3, 78);
201 tmp2 = _mm_xor_si128(tmp2, H3);
202 tmp6 = _mm_xor_si128(tmp6, X3);
203 tmp3 = _mm_shuffle_epi32(H4, 78);
204 tmp7 = _mm_shuffle_epi32(X4, 78);
205 tmp3 = _mm_xor_si128(tmp3, H4);
206 tmp7 = _mm_xor_si128(tmp7, X4);
208 tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
209 tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
210 tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
211 tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
213 tmp0 = _mm_xor_si128(tmp0, lo);
214 tmp0 = _mm_xor_si128(tmp0, hi);
215 tmp0 = _mm_xor_si128(tmp1, tmp0);
216 tmp0 = _mm_xor_si128(tmp2, tmp0);
217 tmp0 = _mm_xor_si128(tmp3, tmp0);
219 tmp4 = _mm_slli_si128(tmp0, 8);
220 tmp0 = _mm_srli_si128(tmp0, 8);
222 lo = _mm_xor_si128(tmp4, lo);
223 hi = _mm_xor_si128(tmp0, hi);
228 tmp7 = _mm_srli_epi32(tmp3, 31);
229 tmp8 = _mm_srli_epi32(tmp6, 31);
230 tmp3 = _mm_slli_epi32(tmp3, 1);
231 tmp6 = _mm_slli_epi32(tmp6, 1);
233 tmp9 = _mm_srli_si128(tmp7, 12);
234 tmp8 = _mm_slli_si128(tmp8, 4);
235 tmp7 = _mm_slli_si128(tmp7, 4);
236 tmp3 = _mm_or_si128(tmp3, tmp7);
237 tmp6 = _mm_or_si128(tmp6, tmp8);
238 tmp6 = _mm_or_si128(tmp6, tmp9);
240 tmp7 = _mm_slli_epi32(tmp3, 31);
241 tmp8 = _mm_slli_epi32(tmp3, 30);
242 tmp9 = _mm_slli_epi32(tmp3, 25);
244 tmp7 = _mm_xor_si128(tmp7, tmp8);
245 tmp7 = _mm_xor_si128(tmp7, tmp9);
246 tmp8 = _mm_srli_si128(tmp7, 4);
247 tmp7 = _mm_slli_si128(tmp7, 12);
248 tmp3 = _mm_xor_si128(tmp3, tmp7);
250 tmp2 = _mm_srli_epi32(tmp3, 1);
251 tmp4 = _mm_srli_epi32(tmp3, 2);
252 tmp5 = _mm_srli_epi32(tmp3, 7);
253 tmp2 = _mm_xor_si128(tmp2, tmp4);
254 tmp2 = _mm_xor_si128(tmp2, tmp5);
255 tmp2 = _mm_xor_si128(tmp2, tmp8);
256 tmp3 = _mm_xor_si128(tmp3, tmp2);
257 tmp6 = _mm_xor_si128(tmp6, tmp3);
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * Encrypt nbytes of in into out with AES-CTR and write the 16-byte GCM
 * authentication tag over (addt, in) into tag.  KEY holds nr+1 round keys;
 * nr is the AES round count (10/12/14).  ibytes is the IV length in bytes.
 *
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1 (big-endian counter). */
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* Arbitrary IV: H = E(K, 0), then Y0 = GHASH_H(IV). */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* Zero-pad the partial trailing IV block. */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* Fold in len(IV) in bits to finish GHASH of the IV. */
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Precompute H^2, H^3, H^4 for the aggregated (4x) reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH the additional authenticated data, 4 blocks at a time. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* Remaining full AAD blocks, one at a time. */
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		/* Zero-padded partial trailing AAD block. */
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* CTR block 1 is Y0+1; maintain eight running counters. */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: encrypt eight blocks per pass, GHASH as we go. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* GHASH the eight ciphertext blocks (two 4x reductions). */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* Remaining full blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	//If remains one incomplete block
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes%16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
		/* Zero the tail so only real ciphertext is GHASHed. */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Fold in len(A) || len(C) (in bits), then T = GHASH ^ E(K,Y0). */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
/* My modification of _encrypt to be _decrypt */
/*
 * Verify the 16-byte tag over (addt, in) and, only if it matches, decrypt
 * nbytes of ciphertext in into out.  Returns 1 on success; returns 0 and
 * writes no plaintext when authentication fails.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	/* Derive H, Y0 and E(K,Y0) exactly as in AES_GCM_encrypt(). */
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH the additional authenticated data. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Fold in len(A) || len(C) and finish the expected tag. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	/*
	 * NOTE(review): m128icmp() is not a constant-time comparison;
	 * upstream behavior is preserved here.
	 */
	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; //in case the authentication failed

	/* Tag is valid: run CTR decryption (GHASH is already done). */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/*
		 * These byte-swapped copies are not used again (the GHASH
		 * over the ciphertext was computed above); retained from
		 * the original structure.
		 */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	//If remains one incomplete block
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j = 0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
	}
	return 1; //when successful returns 1
}