/*
 * Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
#include <openssl/crypto.h>
#include "modes_local.h"
#include <string.h>
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)      BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)    *(u32 *)(p) = BSWAP4(v)
#endif
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
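/*
 * PACK() places a 16-bit reduction constant in the most significant 16
 * bits of a size_t, so that rem_4bit/rem_8bit entries can be XORed
 * into the top of Z.hi in a single operation on 64-bit targets, or
 * shifted into place on 32-bit ones (see the sizeof(size_t)==8 checks
 * below).
 */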
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
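/*
 * Illustrative sketch (compile-guarded, not part of the original code
 * path): REDUCE1BIT shifts the 128-bit value right by one bit and
 * folds the shifted-out bit back in with the constant 0xe1... (the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1), which amounts to one
 * multiply-by-x step in GHASH's reflected bit order. The helper name
 * below is hypothetical; it mirrors how gcm_init_4bit seeds the
 * power-of-two table entries from H.
 */
#if 0
static void example_fill_power_entries(u128 Htable[16], u128 V)
{
    int i;

    /* Htable[8] = H, Htable[4] = H*x, Htable[2] = H*x^2, Htable[1] = H*x^3 */
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);          /* V := V * x mod P, reflected */
        Htable[i] = V;
    }
}
#endif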
/*-
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words, OpenSSL does not cover the whole spectrum of
 * possible table-driven implementations. Why? In the non-"Shoup's"
 * case the memory access pattern is segmented in such a manner that
 * cache-timing information can trivially reveal a fair portion of the
 * intermediate hash value. Given that the ciphertext is always
 * available to an attacker, the attacker can attempt to deduce the
 * secret parameter H and, if successful, tamper with messages [which
 * is trivial in CTR mode]. In the "Shoup's" case it is not as easy,
 * but there is no reason to believe that it is resistant to
 * cache-timing attacks either. As for the "8-bit" implementation, it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side, it should be about twice as fast as
 * the "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it is believed to provide a better security/performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - a larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough
 *   free() results in VM working-set trimming, meaning that a
 *   subsequent malloc() would immediately incur working-set
 *   expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even in the same
 *   thread, in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
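/*
 * For concreteness, given the declarations below: with TABLE_BITS==4
 * the per-key table is u128 Htable[16] (16 * 16 = 256 bytes), while
 * TABLE_BITS==8 uses u128 Htable[256] (256 * 16 = 4KB per key) plus
 * the shared rem_8bit[256] table.
 */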
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
# define GCM_MUL(ctx)      gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little)
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        else
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
    }
# endif
}
# ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
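/*
 * rem_4bit[n] is the reduction constant for the 4 bits shifted out of
 * the low end of Z: the carry-less product of n and 0xE1 (the top byte
 * of the GCM polynomial), left-aligned into the top 16 bits by PACK().
 * E.g. rem_4bit[1] == PACK(0x1C20) == PACK(0xE1 << 5).
 */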
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
#  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

# if 1
    do {
        cnt = 15;
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
# else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows the procedure down by
     * approximately the same amount of time as it makes each loop spin
     * faster. In other words, single-block performance is
     * approximately the same as for the straightforward "4-bit"
     * implementation, and from there on it only gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = (u8)(nlo >> 4);
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }
        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = (u8)(nlo >> 4);
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
# endif
        if (is_endian.little) {
# ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
# else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
# endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp += 16, len -= 16);
}
#  endif
# else
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# endif
# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate
 * cache-thrashing effects. In other words, the idea is to hash the
 * data while it is still in the L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif
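/*
 * 3*1024 bytes is 192 blocks: presumably chosen so that a chunk of
 * output plus the tables fits comfortably in a typical L1 data cache,
 * while still being large enough to amortize loop overhead between
 * the encryption and hashing passes.
 */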
#else                           /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
{
    u128 V, Z = { 0, 0 };
    long X;
    int i, j;
    const long *xi = (const long *)Xi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }
    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
# define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif
#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
        (defined(__i386) || defined(__i386__) || \
         defined(__x86_64) || defined(__x86_64__) || \
         defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)      (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
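/*
 * With GCM_FUNCREF_4BIT in effect, GCM_MUL()/GHASH() become indirect
 * calls through the per-context function pointers that
 * CRYPTO_gcm128_init() selects below from CPU capability bits; each
 * caller therefore loads gcm_gmult_p/gcm_ghash_p from the context
 * before using the macros.
 */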
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }

#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        memcpy(ctx->Yi.c, iv, 12);
        ctx->Yi.c[12] = 0;
        ctx->Yi.c[13] = 0;
        ctx->Yi.c[14] = 0;
        ctx->Yi.c[15] = 1;
        ctr = 1;
    } else {
        size_t i;
        u64 len0 = len;

        /* Borrow ctx->Xi to calculate initial Yi */
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;

        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
#else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
#endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);
        if (is_endian.little)
#ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
#else
            ctr = GETU32(ctx->Xi.c + 12);
#endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }
    ctx->Xi.u[0] = 0;
    ctx->Xi.u[1] = 0;

    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (is_endian.little)
#ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
#else
        PUTU32(ctx->Yi.c + 12, ctr);
#endif
    else
        ctx->Yi.d[3] = ctr;
}
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    if (ctx->len.u[1])
        return -2;

    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0)
            GCM_MUL(ctx);
        else {
            ctx->ares = n;
            return 0;
        }
    }
#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }
    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }
    if (is_endian.little)
#ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
#else
        ctr = GETU32(ctx->Yi.c + 12);
#endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            if (n) {
# if defined(GHASH)
                while (n && len) {
                    *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GHASH(ctx, ctx->Xn, mres);
                    mres = 0;
                } else {
                    ctx->mres = mres;
                    return 0;
                }
# else
                while (n && len) {
                    u8 c = *(in++);
                    *(out++) = c ^ ctx->EKi.c[n];
                    ctx->Xi.c[n] ^= c;
                    --len;
                    n = (n + 1) % 16;
                }
                if (n == 0) {
                    GCM_MUL(ctx);
                    mres = 0;
                } else {
                    ctx->mres = n;
                    return 0;
                }
# endif
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (is_endian.little)
#  ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                        PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (is_endian.little)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
# if defined(GHASH)
                while (len--) {
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
                    ++n;
                }
# else
                while (len--) {
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
                    ++n;
                }
                mres = n;
# endif
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c + 12, ctr);
#endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
        if (n == 0)
            GCM_MUL(ctx);
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    unsigned int n, ctr, mres;
    size_t i;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    mres = ctx->mres;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        if (len == 0) {
            GCM_MUL(ctx);
            ctx->ares = 0;
            return 0;
        }
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (is_endian.little)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
    if (n) {
# if defined(GHASH)
        while (n && len) {
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        } else {
            ctx->mres = mres;
            return 0;
        }
# else
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c ^ ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n + 1) % 16;
        }
        if (n == 0) {
            GCM_MUL(ctx);
            mres = 0;
        } else {
            ctx->mres = n;
            return 0;
        }
# endif
    }
# if defined(GHASH)
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
#  if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (is_endian.little)
#   ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#   else
            PUTU32(ctx->Yi.c + 12, ctr);
#   endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#  endif
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;
            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (is_endian.little)
# ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
# else
            PUTU32(ctx->Yi.c + 12, ctr);
# endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif
    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif
    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}
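/*
 * Illustrative usage sketch (compile-guarded, not part of the upstream
 * file): one-shot authenticated encryption with this low-level API.
 * The AES key schedule `ks` and the cast of AES_encrypt to block128_f
 * are assumptions for the example; real callers normally go through
 * EVP instead of using these functions directly.
 */
#if 0
# include <openssl/aes.h>

static int example_gcm_seal(const AES_KEY *ks,
                            const unsigned char iv[12],
                            const unsigned char *aad, size_t aad_len,
                            const unsigned char *pt, size_t pt_len,
                            unsigned char *ct, unsigned char tag[16])
{
    GCM128_CONTEXT gcm;

    CRYPTO_gcm128_init(&gcm, (void *)ks, (block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&gcm, iv, 12);      /* 96-bit IV fast path */
    if (CRYPTO_gcm128_aad(&gcm, aad, aad_len))
        return -1;
    if (CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len))
        return -1;
    CRYPTO_gcm128_tag(&gcm, tag, 16);       /* emit the 128-bit tag */
    return 0;
}
#endif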