/*
 * Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)      BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)    *(u32 *)(p) = BSWAP4(v)
#endif
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
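/*
 * PACK() places a 16-bit reduction constant in the most significant 16
 * bits of a size_t, so that rem_4bit/rem_8bit entries can be XORed
 * into the top of Z.hi in a single operation on 64-bit targets, or
 * shifted into place on 32-bit ones (see the sizeof(size_t)==8 checks
 * below).
 */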
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
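/*
 * Illustrative sketch (compile-guarded, not part of the original code
 * path): REDUCE1BIT shifts the 128-bit value right by one bit and
 * folds the shifted-out bit back in with the constant 0xe1... (the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1), which amounts to one
 * multiply-by-x step in GHASH's reflected bit order. The helper name
 * below is hypothetical; it mirrors how gcm_init_4bit seeds the
 * power-of-two table entries from H.
 */
#if 0
static void example_fill_power_entries(u128 Htable[16], u128 V)
{
    int i;

    /* Htable[8] = H, Htable[4] = H*x, Htable[2] = H*x^2, Htable[1] = H*x^3 */
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);          /* V := V * x mod P, reflected */
        Htable[i] = V;
    }
}
#endif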
/*-
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words, OpenSSL does not cover the whole spectrum of
 * possible table-driven implementations. Why? In the non-"Shoup's"
 * case the memory access pattern is segmented in such a manner that
 * cache-timing information can trivially reveal a fair portion of the
 * intermediate hash value. Given that the ciphertext is always
 * available to an attacker, the attacker can attempt to deduce the
 * secret parameter H and, if successful, tamper with messages [which
 * is trivial in CTR mode]. In the "Shoup's" case it is not as easy,
 * but there is no reason to believe that it is resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side, it should be about twice as fast as
 * the "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it is believed to provide a better security/performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - a larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough
 *   free() results in VM working-set trimming, meaning that a
 *   subsequent malloc() would immediately incur working-set
 *   expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread, in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
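/*
 * For concreteness, given the declarations below: with TABLE_BITS==4
 * the per-key table is u128 Htable[16] (16 * 16 = 256 bytes), while
 * TABLE_BITS==8 uses u128 Htable[256] (256 * 16 = 4KB per key) plus
 * the shared rem_8bit[256] table.
 */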
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}
# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
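/*
 * rem_4bit[n] is the reduction constant for the 4 bits shifted out of
 * the low end of Z: the carry-less product of n and 0xE1 (the top byte
 * of the GCM polynomial), left-aligned into the top 16 bits by PACK().
 * E.g. rem_4bit[1] == PACK(0x1C20) == PACK(0xE1 << 5).
 */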
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
#  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

# if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
# else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by
     * approximately the same amount of time as it makes each loop spin
     * faster. In other words, single-block performance is
     * approximately the same as for the straightforward "4-bit"
     * implementation, and from there on it only gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = (u8)(nlo >> 4);
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }
        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = (u8)(nlo >> 4);
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
# endif
        if (is_endian.little) {
# ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
# else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
# endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
#  endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif
# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate
 * cache-thrashing effects. In other words, the idea is to hash the
 * data while it is still in the L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif
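/*
 * 3*1024 bytes is 192 blocks: presumably chosen so that a chunk of
 * output plus the tables fits comfortably in a typical L1 data cache,
 * while still being large enough to amortize loop overhead between
 * the encryption and hashing passes.
 */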
#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }
    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif
#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
        (defined(__i386) || defined(__i386__) || \
         defined(__x86_64) || defined(__x86_64__) || \
         defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)      (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
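/*
 * With GCM_FUNCREF_4BIT in effect, GCM_MUL()/GHASH() become indirect
 * calls through the per-context function pointers that
 * CRYPTO_gcm128_init() selects below from CPU capability bits; each
 * caller therefore loads gcm_gmult_p/gcm_ghash_p from the context
 * before using the macros.
 */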
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }

#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);
        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }
    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }
    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }
    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif
    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif
    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}
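/*
 * Illustrative usage sketch (compile-guarded, not part of the upstream
 * file): one-shot authenticated encryption with this low-level API.
 * The AES key schedule `ks` and the cast of AES_encrypt to block128_f
 * are assumptions for the example; real callers normally go through
 * EVP instead of using these functions directly.
 */
#if 0
# include <openssl/aes.h>

static int example_gcm_seal(const AES_KEY *ks,
                            const unsigned char iv[12],
                            const unsigned char *aad, size_t aad_len,
                            const unsigned char *pt, size_t pt_len,
                            unsigned char *ct, unsigned char tag[16])
{
    GCM128_CONTEXT gcm;

    CRYPTO_gcm128_init(&gcm, (void *)ks, (block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&gcm, iv, 12);      /* 96-bit IV fast path */
    if (CRYPTO_gcm128_aad(&gcm, aad, aad_len))
        return -1;
    if (CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len))
        return -1;
    CRYPTO_gcm128_tag(&gcm, tag, 16);       /* emit the 128-bit tag */
    return 0;
}
#endif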