2 * Copyright (c) 2014 The FreeBSD Foundation
5 * This software was developed by John-Mark Gurney under
6 * the sponsorship of the FreeBSD Foundation and
7 * Rubicon Communications, LLC (Netgate).
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * Figure 5, 8 and 12 are copied from the Intel white paper:
36 * Intel® Carry-Less Multiplication Instruction and its Usage for
37 * Computing the GCM Mode
40 * Copyright © 2010 Intel Corporation.
41 * All rights reserved.
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
46 * * Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * * Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * * Neither the name of Intel Corporation nor the
52 * names of its contributors may be used to endorse or promote products
53 * derived from this software without specific prior written permission.
55 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
56 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
57 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
58 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
59 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
60 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
61 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
62 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
63 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
65 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
69 #include <crypto/aesni/aesni.h>
70 #include <crypto/aesni/aesni_os.h>
75 #include <wmmintrin.h>
76 #include <emmintrin.h>
77 #include <smmintrin.h>
/*
 * Compare two 128-bit values for full equality.  Returns non-zero when
 * every byte of a equals the corresponding byte of b.
 */
static int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	/* movemask yields one bit per byte; all-equal -> all 16 bits set. */
	return _mm_movemask_epi8(cmp) == 0xffff;
}
#ifdef __i386__
/*
 * 32-bit compatibility shim: _mm_insert_epi64 is only available on amd64,
 * so emulate it with two 32-bit inserts.  Only ndx values 0 and 1 are
 * meaningful (low and high 64-bit lane respectively).
 */
static __inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return (a);
}
#endif
106 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
108 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
110 gfmul(__m128i a, __m128i b, __m128i *res)
112 __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
114 tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
115 tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
116 tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
117 tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
119 tmp4 = _mm_xor_si128(tmp4, tmp5);
120 tmp5 = _mm_slli_si128(tmp4, 8);
121 tmp4 = _mm_srli_si128(tmp4, 8);
122 tmp3 = _mm_xor_si128(tmp3, tmp5);
123 tmp6 = _mm_xor_si128(tmp6, tmp4);
125 tmp7 = _mm_srli_epi32(tmp3, 31);
126 tmp8 = _mm_srli_epi32(tmp6, 31);
127 tmp3 = _mm_slli_epi32(tmp3, 1);
128 tmp6 = _mm_slli_epi32(tmp6, 1);
130 tmp9 = _mm_srli_si128(tmp7, 12);
131 tmp8 = _mm_slli_si128(tmp8, 4);
132 tmp7 = _mm_slli_si128(tmp7, 4);
133 tmp3 = _mm_or_si128(tmp3, tmp7);
134 tmp6 = _mm_or_si128(tmp6, tmp8);
135 tmp6 = _mm_or_si128(tmp6, tmp9);
137 tmp7 = _mm_slli_epi32(tmp3, 31);
138 tmp8 = _mm_slli_epi32(tmp3, 30);
139 tmp9 = _mm_slli_epi32(tmp3, 25);
141 tmp7 = _mm_xor_si128(tmp7, tmp8);
142 tmp7 = _mm_xor_si128(tmp7, tmp9);
143 tmp8 = _mm_srli_si128(tmp7, 4);
144 tmp7 = _mm_slli_si128(tmp7, 12);
145 tmp3 = _mm_xor_si128(tmp3, tmp7);
147 tmp2 = _mm_srli_epi32(tmp3, 1);
148 tmp4 = _mm_srli_epi32(tmp3, 2);
149 tmp5 = _mm_srli_epi32(tmp3, 7);
150 tmp2 = _mm_xor_si128(tmp2, tmp4);
151 tmp2 = _mm_xor_si128(tmp2, tmp5);
152 tmp2 = _mm_xor_si128(tmp2, tmp8);
153 tmp3 = _mm_xor_si128(tmp3, tmp2);
154 tmp6 = _mm_xor_si128(tmp6, tmp3);
160 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
163 reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
164 __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
166 /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
167 __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
168 H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
169 __m128i tmp0, tmp1, tmp2, tmp3;
170 __m128i tmp4, tmp5, tmp6, tmp7;
173 H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
174 H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
175 H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
176 H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
178 lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
179 lo = _mm_xor_si128(lo, H3_X3_lo);
180 lo = _mm_xor_si128(lo, H4_X4_lo);
182 H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
183 H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
184 H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
185 H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
187 hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
188 hi = _mm_xor_si128(hi, H3_X3_hi);
189 hi = _mm_xor_si128(hi, H4_X4_hi);
191 tmp0 = _mm_shuffle_epi32(H1, 78);
192 tmp4 = _mm_shuffle_epi32(X1, 78);
193 tmp0 = _mm_xor_si128(tmp0, H1);
194 tmp4 = _mm_xor_si128(tmp4, X1);
195 tmp1 = _mm_shuffle_epi32(H2, 78);
196 tmp5 = _mm_shuffle_epi32(X2, 78);
197 tmp1 = _mm_xor_si128(tmp1, H2);
198 tmp5 = _mm_xor_si128(tmp5, X2);
199 tmp2 = _mm_shuffle_epi32(H3, 78);
200 tmp6 = _mm_shuffle_epi32(X3, 78);
201 tmp2 = _mm_xor_si128(tmp2, H3);
202 tmp6 = _mm_xor_si128(tmp6, X3);
203 tmp3 = _mm_shuffle_epi32(H4, 78);
204 tmp7 = _mm_shuffle_epi32(X4, 78);
205 tmp3 = _mm_xor_si128(tmp3, H4);
206 tmp7 = _mm_xor_si128(tmp7, X4);
208 tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
209 tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
210 tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
211 tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
213 tmp0 = _mm_xor_si128(tmp0, lo);
214 tmp0 = _mm_xor_si128(tmp0, hi);
215 tmp0 = _mm_xor_si128(tmp1, tmp0);
216 tmp0 = _mm_xor_si128(tmp2, tmp0);
217 tmp0 = _mm_xor_si128(tmp3, tmp0);
219 tmp4 = _mm_slli_si128(tmp0, 8);
220 tmp0 = _mm_srli_si128(tmp0, 8);
222 lo = _mm_xor_si128(tmp4, lo);
223 hi = _mm_xor_si128(tmp0, hi);
228 tmp7 = _mm_srli_epi32(tmp3, 31);
229 tmp8 = _mm_srli_epi32(tmp6, 31);
230 tmp3 = _mm_slli_epi32(tmp3, 1);
231 tmp6 = _mm_slli_epi32(tmp6, 1);
233 tmp9 = _mm_srli_si128(tmp7, 12);
234 tmp8 = _mm_slli_si128(tmp8, 4);
235 tmp7 = _mm_slli_si128(tmp7, 4);
236 tmp3 = _mm_or_si128(tmp3, tmp7);
237 tmp6 = _mm_or_si128(tmp6, tmp8);
238 tmp6 = _mm_or_si128(tmp6, tmp9);
240 tmp7 = _mm_slli_epi32(tmp3, 31);
241 tmp8 = _mm_slli_epi32(tmp3, 30);
242 tmp9 = _mm_slli_epi32(tmp3, 25);
244 tmp7 = _mm_xor_si128(tmp7, tmp8);
245 tmp7 = _mm_xor_si128(tmp7, tmp9);
246 tmp8 = _mm_srli_si128(tmp7, 4);
247 tmp7 = _mm_slli_si128(tmp7, 12);
248 tmp3 = _mm_xor_si128(tmp3, tmp7);
250 tmp2 = _mm_srli_epi32(tmp3, 1);
251 tmp4 = _mm_srli_epi32(tmp3, 2);
252 tmp5 = _mm_srli_epi32(tmp3, 7);
253 tmp2 = _mm_xor_si128(tmp2, tmp4);
254 tmp2 = _mm_xor_si128(tmp2, tmp5);
255 tmp2 = _mm_xor_si128(tmp2, tmp8);
256 tmp3 = _mm_xor_si128(tmp3, tmp2);
257 tmp6 = _mm_xor_si128(tmp6, tmp3);
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * Encrypt nbytes of in into out with AES-CTR and write the 16-byte GCM
 * authentication tag over (addt, in) into tag.  KEY holds nr+1 round keys;
 * nr is the AES round count (10/12/14).  ibytes is the IV length in bytes.
 *
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1 (big-endian counter). */
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* Arbitrary IV: H = E(K, 0), then Y0 = GHASH_H(IV). */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			/* Zero-pad the partial trailing IV block. */
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* Fold in len(IV) in bits to finish GHASH of the IV. */
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Precompute H^2, H^3, H^4 for the aggregated (4x) reduction. */
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH the additional authenticated data, 4 blocks at a time. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* Remaining full AAD blocks, one at a time. */
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		/* Zero-padded partial trailing AAD block. */
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* CTR block 1 is Y0+1; maintain eight running counters. */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	/* Main loop: encrypt eight blocks per pass, GHASH as we go. */
	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* GHASH the eight ciphertext blocks (two 4x reductions). */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* Remaining full blocks, one at a time. */
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	//If remains one incomplete block
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes%16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j = 0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
		/* Zero the tail so only real ciphertext is GHASHed. */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Fold in len(A) || len(C) (in bits), then T = GHASH ^ E(K,Y0). */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
/* My modification of _encrypt to be _decrypt */
/*
 * Verify the 16-byte tag over (addt, in) and, only if it matches, decrypt
 * nbytes of ciphertext in into out.  Returns 1 on success; returns 0 and
 * writes no plaintext when authentication fails.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	/* Derive H, Y0 and E(K,Y0) exactly as in AES_GCM_encrypt(). */
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i = 0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j = 0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] =
				    ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
		/*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j = 1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* GHASH the additional authenticated data. */
	for (i = 0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j = 0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Fold in len(A) || len(C) and finish the expected tag. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	/*
	 * NOTE(review): m128icmp() is not a constant-time comparison;
	 * upstream behavior is preserved here.
	 */
	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; //in case the authentication failed

	/* Tag is valid: run CTR decryption (GHASH is already done). */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i = 0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j = 1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/*
		 * These byte-swapped copies are not used again (the GHASH
		 * over the ciphertext was computed above); retained from
		 * the original structure.
		 */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k = i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	//If remains one incomplete block
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j = 1; j < nr - 1; j += 2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j = 0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
	}
	return 1; //when successful returns 1
}