sys/crypto/aesni/aesni_ghash.c

   1 /*-
   2  * Copyright (c) 2014 The FreeBSD Foundation
   3  * All rights reserved.
   4  *
   5  * This software was developed by John-Mark Gurney under
   6  * the sponsorship of the FreeBSD Foundation and
   7  * Rubicon Communications, LLC (Netgate).
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1.  Redistributions of source code must retain the above copyright
  12  *     notice, this list of conditions and the following disclaimer.
  13  * 2.  Redistributions in binary form must reproduce the above copyright
  14  *     notice, this list of conditions and the following disclaimer in the
  15  *     documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  *
  30  *      $FreeBSD$
  31  *
  32  */
  33
  34 /*
  35  * Figure 5, 8 and 12 are copied from the Intel white paper:
  36  * Intel® Carry-Less Multiplication Instruction and its Usage for
  37  * Computing the GCM Mode
  38  *
  39  * and as such are:
  40  * Copyright © 2010 Intel Corporation.
  41  * All rights reserved.
  42  *
  43  * Redistribution and use in source and binary forms, with or without
  44  * modification, are permitted provided that the following conditions
  45  * are met:
  46  *   * Redistributions of source code must retain the above copyright
  47  *     notice, this list of conditions and the following disclaimer.
  48  *   * Redistributions in binary form must reproduce the above copyright
  49  *     notice, this list of conditions and the following disclaimer in the
  50  *     documentation and/or other materials provided with the distribution.
  51  *   * Neither the name of Intel Corporation nor the
  52  *     names of its contributors may be used to endorse or promote products
  53  *     derived from this software without specific prior written permission.
  54  *
  55  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  56  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  57  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  58  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  59  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  60  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  61  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  62  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  63  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  64  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  65  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  66  */
  67
  68 #ifdef _KERNEL
  69 #include <crypto/aesni/aesni.h>
  70 #include <crypto/aesni/aesni_os.h>
  71 #else
  72 #include <stdint.h>
  73 #endif
  74
  75 #include <wmmintrin.h>
  76 #include <emmintrin.h>
  77 #include <smmintrin.h>
  78
  79 static inline int
  80 m128icmp(__m128i a, __m128i b)
  81 {
  82         __m128i cmp;
  83
  84         cmp = _mm_cmpeq_epi32(a, b);
  85
  86         return _mm_movemask_epi8(cmp) == 0xffff;
  87 }
  88
  89 #ifdef __i386__
  90 static inline __m128i
  91 _mm_insert_epi64(__m128i a, int64_t b, const int ndx)
  92 {
  93
  94         if (!ndx) {
  95                 a = _mm_insert_epi32(a, b, 0);
  96                 a = _mm_insert_epi32(a, b >> 32, 1);
  97         } else {
  98                 a = _mm_insert_epi32(a, b, 2);
  99                 a = _mm_insert_epi32(a, b >> 32, 3);
 100         }
 101
 102         return a;
 103 }
 104 #endif
 105
 106 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
 107
 108 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
 109 static void
 110 gfmul(__m128i a, __m128i b, __m128i *res)
 111 {
 112         __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
 113
 114         tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
 115         tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
 116         tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
 117         tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
 118
 119         tmp4 = _mm_xor_si128(tmp4, tmp5);
 120         tmp5 = _mm_slli_si128(tmp4, 8);
 121         tmp4 = _mm_srli_si128(tmp4, 8);
 122         tmp3 = _mm_xor_si128(tmp3, tmp5);
 123         tmp6 = _mm_xor_si128(tmp6, tmp4);
 124
 125         tmp7 = _mm_srli_epi32(tmp3, 31);
 126         tmp8 = _mm_srli_epi32(tmp6, 31);
 127         tmp3 = _mm_slli_epi32(tmp3, 1);
 128         tmp6 = _mm_slli_epi32(tmp6, 1);
 129
 130         tmp9 = _mm_srli_si128(tmp7, 12);
 131         tmp8 = _mm_slli_si128(tmp8, 4);
 132         tmp7 = _mm_slli_si128(tmp7, 4);
 133         tmp3 = _mm_or_si128(tmp3, tmp7);
 134         tmp6 = _mm_or_si128(tmp6, tmp8);
 135         tmp6 = _mm_or_si128(tmp6, tmp9);
 136
 137         tmp7 = _mm_slli_epi32(tmp3, 31);
 138         tmp8 = _mm_slli_epi32(tmp3, 30);
 139         tmp9 = _mm_slli_epi32(tmp3, 25);
 140
 141         tmp7 = _mm_xor_si128(tmp7, tmp8);
 142         tmp7 = _mm_xor_si128(tmp7, tmp9);
 143         tmp8 = _mm_srli_si128(tmp7, 4);
 144         tmp7 = _mm_slli_si128(tmp7, 12);
 145         tmp3 = _mm_xor_si128(tmp3, tmp7);
 146
 147         tmp2 = _mm_srli_epi32(tmp3, 1);
 148         tmp4 = _mm_srli_epi32(tmp3, 2);
 149         tmp5 = _mm_srli_epi32(tmp3, 7);
 150         tmp2 = _mm_xor_si128(tmp2, tmp4);
 151         tmp2 = _mm_xor_si128(tmp2, tmp5);
 152         tmp2 = _mm_xor_si128(tmp2, tmp8);
 153         tmp3 = _mm_xor_si128(tmp3, tmp2);
 154         tmp6 = _mm_xor_si128(tmp6, tmp3);
 155
 156         *res = tmp6;
 157 }
 158
 159 /*
 160  * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 161  * Method */
 162 static void
 163 reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
 164     __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
 165 {
 166         /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
 167         __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
 168             H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
 169         __m128i tmp0, tmp1, tmp2, tmp3;
 170         __m128i tmp4, tmp5, tmp6, tmp7;
 171         __m128i tmp8, tmp9;
 172
 173         H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
 174         H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
 175         H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
 176         H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
 177
 178         lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
 179         lo = _mm_xor_si128(lo, H3_X3_lo);
 180         lo = _mm_xor_si128(lo, H4_X4_lo);
 181
 182         H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
 183         H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
 184         H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
 185         H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
 186
 187         hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
 188         hi = _mm_xor_si128(hi, H3_X3_hi);
 189         hi = _mm_xor_si128(hi, H4_X4_hi);
 190
 191         tmp0 = _mm_shuffle_epi32(H1, 78);
 192         tmp4 = _mm_shuffle_epi32(X1, 78);
 193         tmp0 = _mm_xor_si128(tmp0, H1);
 194         tmp4 = _mm_xor_si128(tmp4, X1);
 195         tmp1 = _mm_shuffle_epi32(H2, 78);
 196         tmp5 = _mm_shuffle_epi32(X2, 78);
 197         tmp1 = _mm_xor_si128(tmp1, H2);
 198         tmp5 = _mm_xor_si128(tmp5, X2);
 199         tmp2 = _mm_shuffle_epi32(H3, 78);
 200         tmp6 = _mm_shuffle_epi32(X3, 78);
 201         tmp2 = _mm_xor_si128(tmp2, H3);
 202         tmp6 = _mm_xor_si128(tmp6, X3);
 203         tmp3 = _mm_shuffle_epi32(H4, 78);
 204         tmp7 = _mm_shuffle_epi32(X4, 78);
 205         tmp3 = _mm_xor_si128(tmp3, H4);
 206         tmp7 = _mm_xor_si128(tmp7, X4);
 207
 208         tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
 209         tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
 210         tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
 211         tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
 212
 213         tmp0 = _mm_xor_si128(tmp0, lo);
 214         tmp0 = _mm_xor_si128(tmp0, hi);
 215         tmp0 = _mm_xor_si128(tmp1, tmp0);
 216         tmp0 = _mm_xor_si128(tmp2, tmp0);
 217         tmp0 = _mm_xor_si128(tmp3, tmp0);
 218
 219         tmp4 = _mm_slli_si128(tmp0, 8);
 220         tmp0 = _mm_srli_si128(tmp0, 8);
 221
 222         lo = _mm_xor_si128(tmp4, lo);
 223         hi = _mm_xor_si128(tmp0, hi);
 224
 225         tmp3 = lo;
 226         tmp6 = hi;
 227
 228         tmp7 = _mm_srli_epi32(tmp3, 31);
 229         tmp8 = _mm_srli_epi32(tmp6, 31);
 230         tmp3 = _mm_slli_epi32(tmp3, 1);
 231         tmp6 = _mm_slli_epi32(tmp6, 1);
 232
 233         tmp9 = _mm_srli_si128(tmp7, 12);
 234         tmp8 = _mm_slli_si128(tmp8, 4);
 235         tmp7 = _mm_slli_si128(tmp7, 4);
 236         tmp3 = _mm_or_si128(tmp3, tmp7);
 237         tmp6 = _mm_or_si128(tmp6, tmp8);
 238         tmp6 = _mm_or_si128(tmp6, tmp9);
 239
 240         tmp7 = _mm_slli_epi32(tmp3, 31);
 241         tmp8 = _mm_slli_epi32(tmp3, 30);
 242         tmp9 = _mm_slli_epi32(tmp3, 25);
 243
 244         tmp7 = _mm_xor_si128(tmp7, tmp8);
 245         tmp7 = _mm_xor_si128(tmp7, tmp9);
 246         tmp8 = _mm_srli_si128(tmp7, 4);
 247         tmp7 = _mm_slli_si128(tmp7, 12);
 248         tmp3 = _mm_xor_si128(tmp3, tmp7);
 249
 250         tmp2 = _mm_srli_epi32(tmp3, 1);
 251         tmp4 = _mm_srli_epi32(tmp3, 2);
 252         tmp5 = _mm_srli_epi32(tmp3, 7);
 253         tmp2 = _mm_xor_si128(tmp2, tmp4);
 254         tmp2 = _mm_xor_si128(tmp2, tmp5);
 255         tmp2 = _mm_xor_si128(tmp2, tmp8);
 256         tmp3 = _mm_xor_si128(tmp3, tmp2);
 257         tmp6 = _mm_xor_si128(tmp6, tmp3);
 258
 259         *res = tmp6;
 260 }
 261
 262 /*
 263  * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 264  * Every Four Blocks
 265  */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^36-32 bytes.
 */
 270 void
 271 AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
 272         const unsigned char *addt, const unsigned char *ivec,
 273         unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
 274         const unsigned char *key, int nr)
 275 {
 276         int i, j ,k;
 277         __m128i tmp1, tmp2, tmp3, tmp4;
 278         __m128i tmp5, tmp6, tmp7, tmp8;
 279         __m128i H, H2, H3, H4, Y, T;
 280         const __m128i *KEY = (const __m128i *)key;
 281         __m128i ctr1, ctr2, ctr3, ctr4;
 282         __m128i ctr5, ctr6, ctr7, ctr8;
 283         __m128i last_block = _mm_setzero_si128();
 284         __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
 285         __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
 286         __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
 287             7);
 288         __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
 289             15);
 290         __m128i X = _mm_setzero_si128();
 291
 292         if (ibytes == 96/8) {
 293                 Y = _mm_loadu_si128((const __m128i *)ivec);
 294                 Y = _mm_insert_epi32(Y, 0x1000000, 3);
 295                 /*(Compute E[ZERO, KS] and E[Y0, KS] together*/
 296                 tmp1 = _mm_xor_si128(X, KEY[0]);
 297                 tmp2 = _mm_xor_si128(Y, KEY[0]);
 298                 for (j=1; j < nr-1; j+=2) {
 299                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 300                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
 301
 302                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 303                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
 304                 }
 305                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 306                 tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
 307
 308                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
 309                 T = _mm_aesenclast_si128(tmp2, KEY[nr]);
 310
 311                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
 312         } else {
 313                 tmp1 = _mm_xor_si128(X, KEY[0]);
 314                 for (j=1; j <nr; j++)
 315                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 316                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
 317
 318                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
 319                 Y = _mm_setzero_si128();
 320
 321                 for (i=0; i < ibytes/16; i++) {
 322                         tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
 323                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 324                         Y = _mm_xor_si128(Y, tmp1);
 325                         gfmul(Y, H, &Y);
 326                 }
 327                 if (ibytes%16) {
 328                         for (j=0; j < ibytes%16; j++)
 329                                 ((unsigned char*)&last_block)[j] = ivec[i*16+j];
 330                         tmp1 = last_block;
 331                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 332                         Y = _mm_xor_si128(Y, tmp1);
 333                         gfmul(Y, H, &Y);
 334                 }
 335                 tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
 336                 tmp1 = _mm_insert_epi64(tmp1, 0, 1);
 337
 338                 Y = _mm_xor_si128(Y, tmp1);
 339                 gfmul(Y, H, &Y);
 340                 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
 341                 tmp1 = _mm_xor_si128(Y, KEY[0]);
 342                 for (j=1; j < nr; j++)
 343                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 344                 T = _mm_aesenclast_si128(tmp1, KEY[nr]);
 345         }
 346
 347         gfmul(H,H,&H2);
 348         gfmul(H,H2,&H3);
 349         gfmul(H,H3,&H4);
 350
 351         for (i=0; i<abytes/16/4; i++) {
 352                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
 353                 tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
 354                 tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
 355                 tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
 356
 357                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 358                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
 359                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
 360                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
 361                 tmp1 = _mm_xor_si128(X, tmp1);
 362
 363                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
 364         }
 365         for (i=i*4; i<abytes/16; i++) {
 366                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
 367                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 368                 X = _mm_xor_si128(X,tmp1);
 369                 gfmul(X, H, &X);
 370         }
 371         if (abytes%16) {
 372                 last_block = _mm_setzero_si128();
 373                 for (j=0; j<abytes%16; j++)
 374                         ((unsigned char*)&last_block)[j] = addt[i*16+j];
 375                 tmp1 = last_block;
 376                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 377                 X =_mm_xor_si128(X,tmp1);
 378                 gfmul(X,H,&X);
 379         }
 380
 381         ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
 382         ctr1 = _mm_add_epi64(ctr1, ONE);
 383         ctr2 = _mm_add_epi64(ctr1, ONE);
 384         ctr3 = _mm_add_epi64(ctr2, ONE);
 385         ctr4 = _mm_add_epi64(ctr3, ONE);
 386         ctr5 = _mm_add_epi64(ctr4, ONE);
 387         ctr6 = _mm_add_epi64(ctr5, ONE);
 388         ctr7 = _mm_add_epi64(ctr6, ONE);
 389         ctr8 = _mm_add_epi64(ctr7, ONE);
 390
 391         for (i=0; i<nbytes/16/8; i++) {
 392                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 393                 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
 394                 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
 395                 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
 396                 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
 397                 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
 398                 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
 399                 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
 400
 401                 ctr1 = _mm_add_epi64(ctr1, EIGHT);
 402                 ctr2 = _mm_add_epi64(ctr2, EIGHT);
 403                 ctr3 = _mm_add_epi64(ctr3, EIGHT);
 404                 ctr4 = _mm_add_epi64(ctr4, EIGHT);
 405                 ctr5 = _mm_add_epi64(ctr5, EIGHT);
 406                 ctr6 = _mm_add_epi64(ctr6, EIGHT);
 407                 ctr7 = _mm_add_epi64(ctr7, EIGHT);
 408                 ctr8 = _mm_add_epi64(ctr8, EIGHT);
 409
 410                 tmp1 =_mm_xor_si128(tmp1, KEY[0]);
 411                 tmp2 =_mm_xor_si128(tmp2, KEY[0]);
 412                 tmp3 =_mm_xor_si128(tmp3, KEY[0]);
 413                 tmp4 =_mm_xor_si128(tmp4, KEY[0]);
 414                 tmp5 =_mm_xor_si128(tmp5, KEY[0]);
 415                 tmp6 =_mm_xor_si128(tmp6, KEY[0]);
 416                 tmp7 =_mm_xor_si128(tmp7, KEY[0]);
 417                 tmp8 =_mm_xor_si128(tmp8, KEY[0]);
 418
 419                 for (j=1; j<nr; j++) {
 420                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 421                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
 422                         tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
 423                         tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
 424                         tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
 425                         tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
 426                         tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
 427                         tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
 428                 }
 429                 tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
 430                 tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
 431                 tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
 432                 tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
 433                 tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
 434                 tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
 435                 tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
 436                 tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
 437
 438                 tmp1 = _mm_xor_si128(tmp1,
 439                     _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
 440                 tmp2 = _mm_xor_si128(tmp2,
 441                     _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
 442                 tmp3 = _mm_xor_si128(tmp3,
 443                     _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
 444                 tmp4 = _mm_xor_si128(tmp4,
 445                     _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
 446                 tmp5 = _mm_xor_si128(tmp5,
 447                     _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
 448                 tmp6 = _mm_xor_si128(tmp6,
 449                     _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
 450                 tmp7 = _mm_xor_si128(tmp7,
 451                     _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
 452                 tmp8 = _mm_xor_si128(tmp8,
 453                     _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
 454
 455                 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
 456                 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
 457                 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
 458                 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
 459                 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
 460                 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
 461                 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
 462                 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
 463
 464                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 465                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
 466                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
 467                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
 468                 tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
 469                 tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
 470                 tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
 471                 tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
 472
 473                 tmp1 = _mm_xor_si128(X, tmp1);
 474
 475                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
 476
 477                 tmp5 = _mm_xor_si128(X, tmp5);
 478                 reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
 479         }
 480         for (k=i*8; k<nbytes/16; k++) {
 481                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 482                 ctr1 = _mm_add_epi64(ctr1, ONE);
 483                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
 484                 for (j=1; j<nr-1; j+=2) {
 485                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 486                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 487                 }
 488                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 489                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
 490                 tmp1 = _mm_xor_si128(tmp1,
 491                     _mm_loadu_si128(&((const __m128i *)in)[k]));
 492                 _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
 493                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 494                 X = _mm_xor_si128(X, tmp1);
 495                 gfmul(X,H,&X);
 496         }
 497         //If remains one incomplete block
 498         if (nbytes%16) {
 499                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 500                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
 501                 for (j=1; j<nr-1; j+=2) {
 502                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 503                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 504                 }
 505                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 506                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
 507                 tmp1 = _mm_xor_si128(tmp1,
 508                     _mm_loadu_si128(&((const __m128i *)in)[k]));
 509                 last_block = tmp1;
 510                 for (j=0; j<nbytes%16; j++)
 511                         out[k*16+j] = ((unsigned char*)&last_block)[j];
 512                 for ((void)j; j<16; j++)
 513                         ((unsigned char*)&last_block)[j] = 0;
 514                 tmp1 = last_block;
 515                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 516                 X = _mm_xor_si128(X, tmp1);
 517                 gfmul(X, H, &X);
 518         }
 519         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
 520         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
 521
 522         X = _mm_xor_si128(X, tmp1);
 523         gfmul(X,H,&X);
 524         X = _mm_shuffle_epi8(X, BSWAP_MASK);
 525         T = _mm_xor_si128(X, T);
 526         _mm_storeu_si128((__m128i*)tag, T);
 527 }
 528
 529 /* My modification of _encrypt to be _decrypt */
 530 int
 531 AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
 532         const unsigned char *addt, const unsigned char *ivec,
 533         const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
 534         const unsigned char *key, int nr)
 535 {
 536         int i, j ,k;
 537         __m128i tmp1, tmp2, tmp3, tmp4;
 538         __m128i tmp5, tmp6, tmp7, tmp8;
 539         __m128i H, H2, H3, H4, Y, T;
 540         const __m128i *KEY = (const __m128i *)key;
 541         __m128i ctr1, ctr2, ctr3, ctr4;
 542         __m128i ctr5, ctr6, ctr7, ctr8;
 543         __m128i last_block = _mm_setzero_si128();
 544         __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
 545         __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
 546         __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
 547             7);
 548         __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
 549             15);
 550         __m128i X = _mm_setzero_si128();
 551
 552         if (ibytes == 96/8) {
 553                 Y = _mm_loadu_si128((const __m128i *)ivec);
 554                 Y = _mm_insert_epi32(Y, 0x1000000, 3);
 555                 /*(Compute E[ZERO, KS] and E[Y0, KS] together*/
 556                 tmp1 = _mm_xor_si128(X, KEY[0]);
 557                 tmp2 = _mm_xor_si128(Y, KEY[0]);
 558                 for (j=1; j < nr-1; j+=2) {
 559                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 560                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
 561
 562                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 563                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
 564                 }
 565                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 566                 tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
 567
 568                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
 569                 T = _mm_aesenclast_si128(tmp2, KEY[nr]);
 570
 571                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
 572         } else {
 573                 tmp1 = _mm_xor_si128(X, KEY[0]);
 574                 for (j=1; j <nr; j++)
 575                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 576                 H = _mm_aesenclast_si128(tmp1, KEY[nr]);
 577
 578                 H = _mm_shuffle_epi8(H, BSWAP_MASK);
 579                 Y = _mm_setzero_si128();
 580
 581                 for (i=0; i < ibytes/16; i++) {
 582                         tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
 583                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 584                         Y = _mm_xor_si128(Y, tmp1);
 585                         gfmul(Y, H, &Y);
 586                 }
 587                 if (ibytes%16) {
 588                         for (j=0; j < ibytes%16; j++)
 589                                 ((unsigned char*)&last_block)[j] = ivec[i*16+j];
 590                         tmp1 = last_block;
 591                         tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 592                         Y = _mm_xor_si128(Y, tmp1);
 593                         gfmul(Y, H, &Y);
 594                 }
 595                 tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
 596                 tmp1 = _mm_insert_epi64(tmp1, 0, 1);
 597
 598                 Y = _mm_xor_si128(Y, tmp1);
 599                 gfmul(Y, H, &Y);
 600                 Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
 601                 tmp1 = _mm_xor_si128(Y, KEY[0]);
 602                 for (j=1; j < nr; j++)
 603                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 604                 T = _mm_aesenclast_si128(tmp1, KEY[nr]);
 605         }
 606
 607         gfmul(H,H,&H2);
 608         gfmul(H,H2,&H3);
 609         gfmul(H,H3,&H4);
 610
 611         for (i=0; i<abytes/16/4; i++) {
 612                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
 613                 tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
 614                 tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
 615                 tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);
 616
 617                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 618                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
 619                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
 620                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
 621
 622                 tmp1 = _mm_xor_si128(X, tmp1);
 623
 624                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
 625         }
 626         for (i=i*4; i<abytes/16; i++) {
 627                 tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
 628                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 629                 X = _mm_xor_si128(X,tmp1);
 630                 gfmul(X, H, &X);
 631         }
 632         if (abytes%16) {
 633                 last_block = _mm_setzero_si128();
 634                 for (j=0; j<abytes%16; j++)
 635                         ((unsigned char*)&last_block)[j] = addt[i*16+j];
 636                 tmp1 = last_block;
 637                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 638                 X =_mm_xor_si128(X,tmp1);
 639                 gfmul(X,H,&X);
 640         }
 641
 642         /* This is where we validate the cipher text before decrypt */
 643         for (i = 0; i<nbytes/16/4; i++) {
 644                 tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
 645                 tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
 646                 tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
 647                 tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);
 648
 649                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 650                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
 651                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
 652                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
 653
 654                 tmp1 = _mm_xor_si128(X, tmp1);
 655
 656                 reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
 657         }
 658         for (i = i*4; i<nbytes/16; i++) {
 659                 tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
 660                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 661                 X = _mm_xor_si128(X, tmp1);
 662                 gfmul(X,H,&X);
 663         }
 664         if (nbytes%16) {
 665                 last_block = _mm_setzero_si128();
 666                 for (j=0; j<nbytes%16; j++)
 667                         ((unsigned char*)&last_block)[j] = in[i*16+j];
 668                 tmp1 = last_block;
 669                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 670                 X = _mm_xor_si128(X, tmp1);
 671                 gfmul(X, H, &X);
 672         }
 673
 674         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
 675         tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
 676
 677         X = _mm_xor_si128(X, tmp1);
 678         gfmul(X,H,&X);
 679         X = _mm_shuffle_epi8(X, BSWAP_MASK);
 680         T = _mm_xor_si128(X, T);
 681
 682         if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
 683                 return 0; //in case the authentication failed
 684
 685         ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
 686         ctr1 = _mm_add_epi64(ctr1, ONE);
 687         ctr2 = _mm_add_epi64(ctr1, ONE);
 688         ctr3 = _mm_add_epi64(ctr2, ONE);
 689         ctr4 = _mm_add_epi64(ctr3, ONE);
 690         ctr5 = _mm_add_epi64(ctr4, ONE);
 691         ctr6 = _mm_add_epi64(ctr5, ONE);
 692         ctr7 = _mm_add_epi64(ctr6, ONE);
 693         ctr8 = _mm_add_epi64(ctr7, ONE);
 694
 695         for (i=0; i<nbytes/16/8; i++) {
 696                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 697                 tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
 698                 tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
 699                 tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
 700                 tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
 701                 tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
 702                 tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
 703                 tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
 704
 705                 ctr1 = _mm_add_epi64(ctr1, EIGHT);
 706                 ctr2 = _mm_add_epi64(ctr2, EIGHT);
 707                 ctr3 = _mm_add_epi64(ctr3, EIGHT);
 708                 ctr4 = _mm_add_epi64(ctr4, EIGHT);
 709                 ctr5 = _mm_add_epi64(ctr5, EIGHT);
 710                 ctr6 = _mm_add_epi64(ctr6, EIGHT);
 711                 ctr7 = _mm_add_epi64(ctr7, EIGHT);
 712                 ctr8 = _mm_add_epi64(ctr8, EIGHT);
 713
 714                 tmp1 =_mm_xor_si128(tmp1, KEY[0]);
 715                 tmp2 =_mm_xor_si128(tmp2, KEY[0]);
 716                 tmp3 =_mm_xor_si128(tmp3, KEY[0]);
 717                 tmp4 =_mm_xor_si128(tmp4, KEY[0]);
 718                 tmp5 =_mm_xor_si128(tmp5, KEY[0]);
 719                 tmp6 =_mm_xor_si128(tmp6, KEY[0]);
 720                 tmp7 =_mm_xor_si128(tmp7, KEY[0]);
 721                 tmp8 =_mm_xor_si128(tmp8, KEY[0]);
 722
 723                 for (j=1; j<nr; j++) {
 724                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 725                         tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
 726                         tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
 727                         tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
 728                         tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
 729                         tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
 730                         tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
 731                         tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
 732                 }
 733                 tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
 734                 tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
 735                 tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
 736                 tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
 737                 tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
 738                 tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
 739                 tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
 740                 tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
 741
 742                 tmp1 = _mm_xor_si128(tmp1,
 743                     _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
 744                 tmp2 = _mm_xor_si128(tmp2,
 745                     _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
 746                 tmp3 = _mm_xor_si128(tmp3,
 747                     _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
 748                 tmp4 = _mm_xor_si128(tmp4,
 749                     _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
 750                 tmp5 = _mm_xor_si128(tmp5,
 751                     _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
 752                 tmp6 = _mm_xor_si128(tmp6,
 753                     _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
 754                 tmp7 = _mm_xor_si128(tmp7,
 755                     _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
 756                 tmp8 = _mm_xor_si128(tmp8,
 757                     _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));
 758
 759                 _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
 760                 _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
 761                 _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
 762                 _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
 763                 _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
 764                 _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
 765                 _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
 766                 _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
 767
 768                 tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
 769                 tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
 770                 tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
 771                 tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
 772                 tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
 773                 tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
 774                 tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
 775                 tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
 776         }
 777         for (k=i*8; k<nbytes/16; k++) {
 778                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 779                 ctr1 = _mm_add_epi64(ctr1, ONE);
 780                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
 781                 for (j=1; j<nr-1; j+=2) {
 782                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 783                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 784                 }
 785                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 786                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
 787                 tmp1 = _mm_xor_si128(tmp1,
 788                     _mm_loadu_si128(&((const __m128i *)in)[k]));
 789                 _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
 790         }
 791         //If remains one incomplete block
 792         if (nbytes%16) {
 793                 tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
 794                 tmp1 = _mm_xor_si128(tmp1, KEY[0]);
 795                 for (j=1; j<nr-1; j+=2) {
 796                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
 797                         tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
 798                 }
 799                 tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
 800                 tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
 801                 tmp1 = _mm_xor_si128(tmp1,
 802                     _mm_loadu_si128(&((const __m128i *)in)[k]));
 803                 last_block = tmp1;
 804                 for (j=0; j<nbytes%16; j++)
 805                         out[k*16+j] = ((unsigned char*)&last_block)[j];
 806         }
 807         return 1; //when sucessfull returns 1
 808 }