/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   pclmulqdq (PCLMULQDQ support)
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
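
/*
 * Usage sketch (illustrative only, not part of this file): callers
 * select the implementation at run-time, with a portable fallback
 * such as br_ghash_ctmul when pclmulqdq is not available:
 *
 *    br_ghash gh = br_ghash_pclmul_get();
 *    if (gh == 0) {
 *            gh = &br_ghash_ctmul;
 *    }
 *    gh(y, h, data, len);
 */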

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering on
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The compensating left-shift is
 * relatively inexpensive, and it can be shared between operations.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
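
/*
 * A small worked example of the reversal property, on 4-bit values
 * (rev_4 reverses 4 bits; a carryless 4x4 product has 7 bits): with
 * x = 0011 (X+1) and y = 0110 (X^2+X), the carryless product x*y is
 * X^3+X = 0001010. Reversing the operands gives rev_4(x) = 1100 and
 * rev_4(y) = 0110, whose carryless product is X^5+X^3 = 0101000,
 * which is exactly rev_7(0001010).
 */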

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3
 * opcode). However, this crashes old Clang versions, so, for Clang
 * before 3.8, we use an alternate (and less efficient) version that
 * sticks to SSE2: swap the two bytes within each 16-bit word, then
 * reverse the order of the 16-bit words with shuffles.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic,
 * so, for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}

BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif
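
/*
 * Note on the immediate byte: bit 0 selects the 64-bit half of the
 * first operand and bit 4 the half of the second operand. Hence
 * pclmulqdq00() returns the 128-bit carryless product of the two low
 * halves, and pclmulqdq11() that of the two high halves.
 */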

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
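
/*
 * The kx values feed a Karatsuba-style multiplication: writing a
 * 128-bit element as a1:a0 (64-bit halves), the product with b1:b0
 * needs only three carryless multiplications, since the middle term
 * is (a0^a1)*(b0^b1) ^ a0*b0 ^ a1*b1 (in GF(2), addition is XOR).
 */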

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)
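
/*
 * This is the compensating shift described above: the byte-reversed
 * representation yields rev_255(x*y), i.e. the product right-shifted
 * by 1 bit, so the 256-bit intermediate result is shifted back left
 * by 1 bit before reduction.
 */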

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
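
/*
 * The shift counts encode the GHASH modulus X^128 + X^7 + X^2 + X + 1:
 * in the bit-reversed representation, each high word folds into the
 * result through shifts by 0, 1, 2 and 7 bits, with the left shifts
 * by 63, 62 and 57 (= 64-1, 64-2, 64-7) carrying the bits that cross
 * a 64-bit word boundary.
 */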

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
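
/*
 * Squaring needs only two carryless multiplications (no Karatsuba
 * middle term): in GF(2), (a1:a0)^2 = a1^2 : a0^2, because the two
 * cross terms a1*a0 cancel each other out.
 */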

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}
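
	/*
	 * A partial final block is copied into the local tmp buffer
	 * and zero-padded, so that all subsequent loads are full
	 * 16-byte loads (GHASH pads the last block with zeros anyway).
	 */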

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);
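
		/*
		 * The sequence above is the generic multiply pattern
		 * used throughout this file: three pclmulqdq via
		 * Karatsuba, reassembly of the 256-bit product into
		 * four 64-bit chunks (t0..t3), then shift and modular
		 * reduction.
		 */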

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 + 0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw0, ax0);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
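
			/*
			 * One iteration aggregates four blocks:
			 *   yw <- ((yw^a0)*h^4) ^ (a1*h^3) ^ (a2*h^2)
			 *         ^ (a3*h)
			 * which equals four chained GHASH updates, but
			 * with a single shift+reduction at the end.
			 */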

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	/*
	 * Remaining blocks are processed one at a time, with the
	 * classic GHASH update: yw <- (yw ^ a_i) * h.
	 */
	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif