/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *      $FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

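/*
 * Return non-zero if the two 128-bit values are equal; used below to compare
 * the computed tag against the caller-supplied tag.
 */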
static inline int
m128icmp(__m128i a, __m128i b)
{
        __m128i cmp;

        cmp = _mm_cmpeq_epi32(a, b);

        return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
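/*
 * The 64-bit element insert intrinsic is only available in 64-bit mode, so
 * emulate it with two 32-bit inserts on i386.
 */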
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

        if (!ndx) {
                a = _mm_insert_epi32(a, b, 0);
                a = _mm_insert_epi32(a, b >> 32, 1);
        } else {
                a = _mm_insert_epi32(a, b, 2);
                a = _mm_insert_epi32(a, b >> 32, 3);
        }

        return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
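/*
 * gfmul() multiplies a and b in GF(2^128) and reduces the carry-less product
 * modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1, storing the result in
 * *res.
 */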
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
        __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

        tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
        tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
        tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
        tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

        tmp4 = _mm_xor_si128(tmp4, tmp5);
        tmp5 = _mm_slli_si128(tmp4, 8);
        tmp4 = _mm_srli_si128(tmp4, 8);
        tmp3 = _mm_xor_si128(tmp3, tmp5);
        tmp6 = _mm_xor_si128(tmp6, tmp4);

        tmp7 = _mm_srli_epi32(tmp3, 31);
        tmp8 = _mm_srli_epi32(tmp6, 31);
        tmp3 = _mm_slli_epi32(tmp3, 1);
        tmp6 = _mm_slli_epi32(tmp6, 1);

        tmp9 = _mm_srli_si128(tmp7, 12);
        tmp8 = _mm_slli_si128(tmp8, 4);
        tmp7 = _mm_slli_si128(tmp7, 4);
        tmp3 = _mm_or_si128(tmp3, tmp7);
        tmp6 = _mm_or_si128(tmp6, tmp8);
        tmp6 = _mm_or_si128(tmp6, tmp9);

        tmp7 = _mm_slli_epi32(tmp3, 31);
        tmp8 = _mm_slli_epi32(tmp3, 30);
        tmp9 = _mm_slli_epi32(tmp3, 25);

        tmp7 = _mm_xor_si128(tmp7, tmp8);
        tmp7 = _mm_xor_si128(tmp7, tmp9);
        tmp8 = _mm_srli_si128(tmp7, 4);
        tmp7 = _mm_slli_si128(tmp7, 12);
        tmp3 = _mm_xor_si128(tmp3, tmp7);

        tmp2 = _mm_srli_epi32(tmp3, 1);
        tmp4 = _mm_srli_epi32(tmp3, 2);
        tmp5 = _mm_srli_epi32(tmp3, 7);
        tmp2 = _mm_xor_si128(tmp2, tmp4);
        tmp2 = _mm_xor_si128(tmp2, tmp5);
        tmp2 = _mm_xor_si128(tmp2, tmp8);
        tmp3 = _mm_xor_si128(tmp3, tmp2);
        tmp6 = _mm_xor_si128(tmp6, tmp3);

        *res = tmp6;
}

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method */
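/*
 * reduce4() computes X1*H1 + X2*H2 + X3*H3 + X4*H4 in GF(2^128) with a single
 * aggregated reduction at the end; callers pass the precomputed powers H,
 * H^2, H^3 and H^4 for H1..H4.
 */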
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
        /* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
        __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
            H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
        __m128i tmp0, tmp1, tmp2, tmp3;
        __m128i tmp4, tmp5, tmp6, tmp7;
        __m128i tmp8, tmp9;

        H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
        H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
        H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
        H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

        lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
        lo = _mm_xor_si128(lo, H3_X3_lo);
        lo = _mm_xor_si128(lo, H4_X4_lo);

        H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
        H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
        H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
        H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

        hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
        hi = _mm_xor_si128(hi, H3_X3_hi);
        hi = _mm_xor_si128(hi, H4_X4_hi);

        tmp0 = _mm_shuffle_epi32(H1, 78);
        tmp4 = _mm_shuffle_epi32(X1, 78);
        tmp0 = _mm_xor_si128(tmp0, H1);
        tmp4 = _mm_xor_si128(tmp4, X1);
        tmp1 = _mm_shuffle_epi32(H2, 78);
        tmp5 = _mm_shuffle_epi32(X2, 78);
        tmp1 = _mm_xor_si128(tmp1, H2);
        tmp5 = _mm_xor_si128(tmp5, X2);
        tmp2 = _mm_shuffle_epi32(H3, 78);
        tmp6 = _mm_shuffle_epi32(X3, 78);
        tmp2 = _mm_xor_si128(tmp2, H3);
        tmp6 = _mm_xor_si128(tmp6, X3);
        tmp3 = _mm_shuffle_epi32(H4, 78);
        tmp7 = _mm_shuffle_epi32(X4, 78);
        tmp3 = _mm_xor_si128(tmp3, H4);
        tmp7 = _mm_xor_si128(tmp7, X4);

        tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
        tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
        tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
        tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

        tmp0 = _mm_xor_si128(tmp0, lo);
        tmp0 = _mm_xor_si128(tmp0, hi);
        tmp0 = _mm_xor_si128(tmp1, tmp0);
        tmp0 = _mm_xor_si128(tmp2, tmp0);
        tmp0 = _mm_xor_si128(tmp3, tmp0);

        tmp4 = _mm_slli_si128(tmp0, 8);
        tmp0 = _mm_srli_si128(tmp0, 8);

        lo = _mm_xor_si128(tmp4, lo);
        hi = _mm_xor_si128(tmp0, hi);

        tmp3 = lo;
        tmp6 = hi;

        tmp7 = _mm_srli_epi32(tmp3, 31);
        tmp8 = _mm_srli_epi32(tmp6, 31);
        tmp3 = _mm_slli_epi32(tmp3, 1);
        tmp6 = _mm_slli_epi32(tmp6, 1);

        tmp9 = _mm_srli_si128(tmp7, 12);
        tmp8 = _mm_slli_si128(tmp8, 4);
        tmp7 = _mm_slli_si128(tmp7, 4);
        tmp3 = _mm_or_si128(tmp3, tmp7);
        tmp6 = _mm_or_si128(tmp6, tmp8);
        tmp6 = _mm_or_si128(tmp6, tmp9);

        tmp7 = _mm_slli_epi32(tmp3, 31);
        tmp8 = _mm_slli_epi32(tmp3, 30);
        tmp9 = _mm_slli_epi32(tmp3, 25);

        tmp7 = _mm_xor_si128(tmp7, tmp8);
        tmp7 = _mm_xor_si128(tmp7, tmp9);
        tmp8 = _mm_srli_si128(tmp7, 4);
        tmp7 = _mm_slli_si128(tmp7, 12);
        tmp3 = _mm_xor_si128(tmp3, tmp7);

        tmp2 = _mm_srli_epi32(tmp3, 1);
        tmp4 = _mm_srli_epi32(tmp3, 2);
        tmp5 = _mm_srli_epi32(tmp3, 7);
        tmp2 = _mm_xor_si128(tmp2, tmp4);
        tmp2 = _mm_xor_si128(tmp2, tmp5);
        tmp2 = _mm_xor_si128(tmp2, tmp8);
        tmp3 = _mm_xor_si128(tmp3, tmp2);
        tmp6 = _mm_xor_si128(tmp6, tmp3);

        *res = tmp6;
}

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 bits, i.e.
 * (2^39-256)/8 = 2^36-32 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
        const unsigned char *addt, const unsigned char *ivec,
        unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
        const unsigned char *key, int nr)
{
        int i, j, k;
        __m128i tmp1, tmp2, tmp3, tmp4;
        __m128i tmp5, tmp6, tmp7, tmp8;
        __m128i H, H2, H3, H4, Y, T;
        const __m128i *KEY = (const __m128i *)key;
        __m128i ctr1, ctr2, ctr3, ctr4;
        __m128i ctr5, ctr6, ctr7, ctr8;
        __m128i last_block = _mm_setzero_si128();
        __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
        __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
        __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
            7);
        __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
            15);
        __m128i X = _mm_setzero_si128();

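        /*
         * Derive the hash key H = E(K, 0^128) and the pre-counter block Y0.
         * For the common 96-bit IV case, Y0 is simply IV || 0^31 || 1;
         * otherwise Y0 is the GHASH of the IV, per NIST SP 800-38D.
         */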
        if (ibytes == 96/8) {
                Y = _mm_loadu_si128((const __m128i *)ivec);
                Y = _mm_insert_epi32(Y, 0x1000000, 3);
                /* Compute E[ZERO, KS] and E[Y0, KS] together */
                tmp1 = _mm_xor_si128(X, KEY[0]);
                tmp2 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

                H = _mm_aesenclast_si128(tmp1, KEY[nr]);
                T = _mm_aesenclast_si128(tmp2, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
        } else {
                tmp1 = _mm_xor_si128(X, KEY[0]);
                for (j=1; j <nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                H = _mm_aesenclast_si128(tmp1, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
                Y = _mm_setzero_si128();

                for (i=0; i < ibytes/16; i++) {
                        tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                if (ibytes%16) {
                        for (j=0; j < ibytes%16; j++)
                                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
                        tmp1 = last_block;
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
                tmp1 = _mm_insert_epi64(tmp1, 0, 1);

                Y = _mm_xor_si128(Y, tmp1);
                gfmul(Y, H, &Y);
                Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
                tmp1 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                T = _mm_aesenclast_si128(tmp1, KEY[nr]);
        }

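        /* Precompute H^2, H^3 and H^4 for the four-block aggregated reduction. */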
        gfmul(H,H,&H2);
        gfmul(H,H2,&H3);
        gfmul(H,H3,&H4);

        for (i=0; i<abytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
                tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i=i*4; i<abytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X,tmp1);
                gfmul(X, H, &X);
        }
        if (abytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<abytes%16; j++)
                        ((unsigned char*)&last_block)[j] = addt[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X =_mm_xor_si128(X,tmp1);
                gfmul(X,H,&X);
        }

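        /*
         * Build eight consecutive counter blocks.  BSWAP_EPI64 byte-swaps
         * each 64-bit half so the big-endian GCM counter field can be
         * advanced with a plain 64-bit add; the blocks are swapped back
         * before being encrypted.
         */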
        ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
        ctr1 = _mm_add_epi64(ctr1, ONE);
        ctr2 = _mm_add_epi64(ctr1, ONE);
        ctr3 = _mm_add_epi64(ctr2, ONE);
        ctr4 = _mm_add_epi64(ctr3, ONE);
        ctr5 = _mm_add_epi64(ctr4, ONE);
        ctr6 = _mm_add_epi64(ctr5, ONE);
        ctr7 = _mm_add_epi64(ctr6, ONE);
        ctr8 = _mm_add_epi64(ctr7, ONE);

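        /*
         * Main loop: encrypt eight counter blocks per iteration, XOR them
         * with the plaintext, and fold the resulting ciphertext into the
         * GHASH four blocks at a time via reduce4().
         */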
        for (i=0; i<nbytes/16/8; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
                tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
                tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
                tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
                tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
                tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
                tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

                ctr1 = _mm_add_epi64(ctr1, EIGHT);
                ctr2 = _mm_add_epi64(ctr2, EIGHT);
                ctr3 = _mm_add_epi64(ctr3, EIGHT);
                ctr4 = _mm_add_epi64(ctr4, EIGHT);
                ctr5 = _mm_add_epi64(ctr5, EIGHT);
                ctr6 = _mm_add_epi64(ctr6, EIGHT);
                ctr7 = _mm_add_epi64(ctr7, EIGHT);
                ctr8 = _mm_add_epi64(ctr8, EIGHT);

                tmp1 =_mm_xor_si128(tmp1, KEY[0]);
                tmp2 =_mm_xor_si128(tmp2, KEY[0]);
                tmp3 =_mm_xor_si128(tmp3, KEY[0]);
                tmp4 =_mm_xor_si128(tmp4, KEY[0]);
                tmp5 =_mm_xor_si128(tmp5, KEY[0]);
                tmp6 =_mm_xor_si128(tmp6, KEY[0]);
                tmp7 =_mm_xor_si128(tmp7, KEY[0]);
                tmp8 =_mm_xor_si128(tmp8, KEY[0]);

                for (j=1; j<nr; j++) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
                        tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
                        tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
                        tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
                        tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
                        tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
                        tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
                }
                tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
                tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
                tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
                tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
                tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
                tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
                tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
                tmp2 = _mm_xor_si128(tmp2,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
                tmp3 = _mm_xor_si128(tmp3,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
                tmp4 = _mm_xor_si128(tmp4,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
                tmp5 = _mm_xor_si128(tmp5,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
                tmp6 = _mm_xor_si128(tmp6,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
                tmp7 = _mm_xor_si128(tmp7,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
                tmp8 = _mm_xor_si128(tmp8,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

                _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
                _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
                _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
                _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
                _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
                _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
                _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
                _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
                tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
                tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
                tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

                tmp5 = _mm_xor_si128(X, tmp5);
                reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
        }
        for (k=i*8; k<nbytes/16; k++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr1 = _mm_add_epi64(ctr1, ONE);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((const __m128i *)in)[k]));
                _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X,H,&X);
        }
        /* Handle the final partial block, if any */
        if (nbytes%16) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                last_block = _mm_setzero_si128();
                memcpy(&last_block, &((const __m128i *)in)[k],
                    nbytes % 16);
                last_block = _mm_xor_si128(last_block, tmp1);
                for (j=0; j<nbytes%16; j++)
                        out[k*16+j] = ((unsigned char*)&last_block)[j];
                for ((void)j; j<16; j++)
                        ((unsigned char*)&last_block)[j] = 0;
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X, H, &X);
        }
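        /*
         * Finish the GHASH with the lengths block (len(A) || len(C), in bits)
         * and XOR with E(K, Y0) to form the tag.
         */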
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

        X = _mm_xor_si128(X, tmp1);
        gfmul(X,H,&X);
        X = _mm_shuffle_epi8(X, BSWAP_MASK);
        T = _mm_xor_si128(X, T);
        _mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
        const unsigned char *addt, const unsigned char *ivec,
        const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
        const unsigned char *key, int nr)
{
        int i, j, k;
        __m128i tmp1, tmp2, tmp3, tmp4;
        __m128i tmp5, tmp6, tmp7, tmp8;
        __m128i H, H2, H3, H4, Y, T;
        const __m128i *KEY = (const __m128i *)key;
        __m128i ctr1, ctr2, ctr3, ctr4;
        __m128i ctr5, ctr6, ctr7, ctr8;
        __m128i last_block = _mm_setzero_si128();
        __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
        __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
        __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
            7);
        __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
            15);
        __m128i X = _mm_setzero_si128();

        if (ibytes == 96/8) {
                Y = _mm_loadu_si128((const __m128i *)ivec);
                Y = _mm_insert_epi32(Y, 0x1000000, 3);
                /* Compute E[ZERO, KS] and E[Y0, KS] together */
                tmp1 = _mm_xor_si128(X, KEY[0]);
                tmp2 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

                H = _mm_aesenclast_si128(tmp1, KEY[nr]);
                T = _mm_aesenclast_si128(tmp2, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
        } else {
                tmp1 = _mm_xor_si128(X, KEY[0]);
                for (j=1; j <nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                H = _mm_aesenclast_si128(tmp1, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
                Y = _mm_setzero_si128();

                for (i=0; i < ibytes/16; i++) {
                        tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                if (ibytes%16) {
                        for (j=0; j < ibytes%16; j++)
                                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
                        tmp1 = last_block;
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
                tmp1 = _mm_insert_epi64(tmp1, 0, 1);

                Y = _mm_xor_si128(Y, tmp1);
                gfmul(Y, H, &Y);
                Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
                tmp1 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                T = _mm_aesenclast_si128(tmp1, KEY[nr]);
        }

        gfmul(H,H,&H2);
        gfmul(H,H2,&H3);
        gfmul(H,H3,&H4);

        for (i=0; i<abytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
                tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i=i*4; i<abytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X,tmp1);
                gfmul(X, H, &X);
        }
        if (abytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<abytes%16; j++)
                        ((unsigned char*)&last_block)[j] = addt[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X =_mm_xor_si128(X,tmp1);
                gfmul(X,H,&X);
        }

        /* Validate the ciphertext: compute the GHASH over it before any decryption is done */
        for (i = 0; i<nbytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
                tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i = i*4; i<nbytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X,H,&X);
        }
        if (nbytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<nbytes%16; j++)
                        ((unsigned char*)&last_block)[j] = in[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X, H, &X);
        }

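        /* Finish the GHASH over A and C with the lengths block. */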
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

        X = _mm_xor_si128(X, tmp1);
        gfmul(X,H,&X);
        X = _mm_shuffle_epi8(X, BSWAP_MASK);
        T = _mm_xor_si128(X, T);

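        /* Verify the tag before producing any plaintext; bail out on mismatch. */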
        if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
                return 0;       /* authentication failed */

        ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
        ctr1 = _mm_add_epi64(ctr1, ONE);
        ctr2 = _mm_add_epi64(ctr1, ONE);
        ctr3 = _mm_add_epi64(ctr2, ONE);
        ctr4 = _mm_add_epi64(ctr3, ONE);
        ctr5 = _mm_add_epi64(ctr4, ONE);
        ctr6 = _mm_add_epi64(ctr5, ONE);
        ctr7 = _mm_add_epi64(ctr6, ONE);
        ctr8 = _mm_add_epi64(ctr7, ONE);

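        /*
         * The tag has already been verified over the ciphertext above, so
         * this loop only performs the counter-mode decryption; no further
         * GHASH updates are needed.
         */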
        for (i=0; i<nbytes/16/8; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
                tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
                tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
                tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
                tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
                tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
                tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

                ctr1 = _mm_add_epi64(ctr1, EIGHT);
                ctr2 = _mm_add_epi64(ctr2, EIGHT);
                ctr3 = _mm_add_epi64(ctr3, EIGHT);
                ctr4 = _mm_add_epi64(ctr4, EIGHT);
                ctr5 = _mm_add_epi64(ctr5, EIGHT);
                ctr6 = _mm_add_epi64(ctr6, EIGHT);
                ctr7 = _mm_add_epi64(ctr7, EIGHT);
                ctr8 = _mm_add_epi64(ctr8, EIGHT);

                tmp1 =_mm_xor_si128(tmp1, KEY[0]);
                tmp2 =_mm_xor_si128(tmp2, KEY[0]);
                tmp3 =_mm_xor_si128(tmp3, KEY[0]);
                tmp4 =_mm_xor_si128(tmp4, KEY[0]);
                tmp5 =_mm_xor_si128(tmp5, KEY[0]);
                tmp6 =_mm_xor_si128(tmp6, KEY[0]);
                tmp7 =_mm_xor_si128(tmp7, KEY[0]);
                tmp8 =_mm_xor_si128(tmp8, KEY[0]);

                for (j=1; j<nr; j++) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
                        tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
                        tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
                        tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
                        tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
                        tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
                        tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
                }
                tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
                tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
                tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
                tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
                tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
                tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
                tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
                tmp2 = _mm_xor_si128(tmp2,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
                tmp3 = _mm_xor_si128(tmp3,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
                tmp4 = _mm_xor_si128(tmp4,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
                tmp5 = _mm_xor_si128(tmp5,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
                tmp6 = _mm_xor_si128(tmp6,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
                tmp7 = _mm_xor_si128(tmp7,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
                tmp8 = _mm_xor_si128(tmp8,
                    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

                _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
                _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
                _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
                _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
                _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
                _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
                _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
                _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
                tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
                tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
                tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
        }
        for (k=i*8; k<nbytes/16; k++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr1 = _mm_add_epi64(ctr1, ONE);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((const __m128i *)in)[k]));
                _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
        }
        /* Handle the final partial block, if any */
        if (nbytes%16) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                last_block = _mm_setzero_si128();
                memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
                tmp1 = _mm_xor_si128(tmp1, last_block);
                last_block = tmp1;
                for (j=0; j<nbytes%16; j++)
                        out[k*16+j] = ((unsigned char*)&last_block)[j];
        }
        return 1;       /* authentication succeeded */
}
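
/*
 * Example usage (illustrative only, not compiled): a minimal sketch of how
 * these routines are called.  The key schedule must already contain the
 * expanded AES round keys; expand_aes_key() below is a hypothetical
 * placeholder for whatever key-expansion routine the caller uses.
 */
#if 0
static void
example_usage(void)
{
        /* AES-128: nr = 10 rounds, (nr + 1) * 16 bytes of round keys. */
        unsigned char key_schedule[11 * 16];
        unsigned char iv[12] = { 0 };           /* 96-bit IV */
        unsigned char aad[16] = { 0 };
        unsigned char pt[64] = { 0 }, ct[64], tag[16];

        /* expand_aes_key(user_key, key_schedule);      hypothetical helper */

        AES_GCM_encrypt(pt, ct, aad, iv, tag,
            sizeof(pt), sizeof(aad), sizeof(iv), key_schedule, 10);

        /* AES_GCM_decrypt() verifies the tag first and returns 0 on failure. */
        if (AES_GCM_decrypt(ct, pt, aad, iv, tag,
            sizeof(ct), sizeof(aad), sizeof(iv), key_schedule, 10) == 0)
                return;         /* authentication failed */
}
#endif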