sys/libkern/x86/crc32_sse42.c

   1 /*
   2  * Derived from crc32c.c version 1.1 by Mark Adler.
   3  *
   4  * Copyright (C) 2013 Mark Adler
   5  *
   6  * This software is provided 'as-is', without any express or implied warranty.
   7  * In no event will the author be held liable for any damages arising from the
   8  * use of this software.
   9  *
  10  * Permission is granted to anyone to use this software for any purpose,
  11  * including commercial applications, and to alter it and redistribute it
  12  * freely, subject to the following restrictions:
  13  *
  14  * 1. The origin of this software must not be misrepresented; you must not
  15  *    claim that you wrote the original software. If you use this software
  16  *    in a product, an acknowledgment in the product documentation would be
  17  *    appreciated but is not required.
  18  * 2. Altered source versions must be plainly marked as such, and must not be
  19  *    misrepresented as being the original software.
  20  * 3. This notice may not be removed or altered from any source distribution.
  21  *
  22  * Mark Adler
  23  * madler@alumni.caltech.edu
  24  */
  25
  26 #include <sys/cdefs.h>
  27 __FBSDID("$FreeBSD$");
  28
  29 /*
  30  * This file is compiled in userspace in order to run ATF unit tests.
  31  */
  32 #ifdef USERSPACE_TESTING
  33 #include <stdint.h>
  34 #include <stdlib.h>
  35 #else
  36 #include <sys/param.h>
  37 #include <sys/systm.h>
  38 #include <sys/kernel.h>
  39 #endif
  40
  41 static __inline uint32_t
  42 _mm_crc32_u8(uint32_t x, uint8_t y)
  43 {
  44         /*
  45          * clang (at least 3.9.[0-1]) pessimizes "rm" (y) and "m" (y)
  46          * significantly and "r" (y) a lot by copying y to a different
  47          * local variable (on the stack or in a register), so only use
  48          * the latter.  This costs a register and an instruction but
  49          * not a uop.
  50          */
  51         __asm("crc32b %1,%0" : "+r" (x) : "r" (y));
  52         return (x);
  53 }
  54
  55 #ifdef __amd64__
  56 static __inline uint64_t
  57 _mm_crc32_u64(uint64_t x, uint64_t y)
  58 {
  59         __asm("crc32q %1,%0" : "+r" (x) : "r" (y));
  60         return (x);
  61 }
  62 #else
  63 static __inline uint32_t
  64 _mm_crc32_u32(uint32_t x, uint32_t y)
  65 {
  66         __asm("crc32l %1,%0" : "+r" (x) : "r" (y));
  67         return (x);
  68 }
  69 #endif
  70
  71 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
  72 #define POLY    0x82f63b78
  73
  74 /*
  75  * Block sizes for three-way parallel crc computation.  LONG and SHORT must
  76  * both be powers of two.
  77  */
  78 #define LONG    128
  79 #define SHORT   64
  80
  81 /*
  82  * Tables for updating a crc for LONG, 2 * LONG, SHORT and 2 * SHORT bytes
  83  * of value 0 later in the input stream, in the same way that the hardware
  84  * would, but in software without calculating intermediate steps.
  85  */
  86 static uint32_t crc32c_long[4][256];
  87 static uint32_t crc32c_2long[4][256];
  88 static uint32_t crc32c_short[4][256];
  89 static uint32_t crc32c_2short[4][256];
  90
  91 /*
  92  * Multiply a matrix times a vector over the Galois field of two elements,
  93  * GF(2).  Each element is a bit in an unsigned integer.  mat must have at
  94  * least as many entries as the power of two for most significant one bit in
  95  * vec.
  96  */
  97 static inline uint32_t
  98 gf2_matrix_times(uint32_t *mat, uint32_t vec)
  99 {
 100         uint32_t sum;
 101
 102         sum = 0;
 103         while (vec) {
 104                 if (vec & 1)
 105                         sum ^= *mat;
 106                 vec >>= 1;
 107                 mat++;
 108         }
 109         return (sum);
 110 }
 111
 112 /*
 113  * Multiply a matrix by itself over GF(2).  Both mat and square must have 32
 114  * rows.
 115  */
 116 static inline void
 117 gf2_matrix_square(uint32_t *square, uint32_t *mat)
 118 {
 119         int n;
 120
 121         for (n = 0; n < 32; n++)
 122                 square[n] = gf2_matrix_times(mat, mat[n]);
 123 }
 124
 125 /*
 126  * Construct an operator to apply len zeros to a crc.  len must be a power of
 127  * two.  If len is not a power of two, then the result is the same as for the
 128  * largest power of two less than len.  The result for len == 0 is the same as
 129  * for len == 1.  A version of this routine could be easily written for any
 130  * len, but that is not needed for this application.
 131  */
 132 static void
 133 crc32c_zeros_op(uint32_t *even, size_t len)
 134 {
 135         uint32_t odd[32];       /* odd-power-of-two zeros operator */
 136         uint32_t row;
 137         int n;
 138
 139         /* put operator for one zero bit in odd */
 140         odd[0] = POLY;              /* CRC-32C polynomial */
 141         row = 1;
 142         for (n = 1; n < 32; n++) {
 143                 odd[n] = row;
 144                 row <<= 1;
 145         }
 146
 147         /* put operator for two zero bits in even */
 148         gf2_matrix_square(even, odd);
 149
 150         /* put operator for four zero bits in odd */
 151         gf2_matrix_square(odd, even);
 152
 153         /*
 154          * first square will put the operator for one zero byte (eight zero
 155          * bits), in even -- next square puts operator for two zero bytes in
 156          * odd, and so on, until len has been rotated down to zero
 157          */
 158         do {
 159                 gf2_matrix_square(even, odd);
 160                 len >>= 1;
 161                 if (len == 0)
 162                         return;
 163                 gf2_matrix_square(odd, even);
 164                 len >>= 1;
 165         } while (len);
 166
 167         /* answer ended up in odd -- copy to even */
 168         for (n = 0; n < 32; n++)
 169                 even[n] = odd[n];
 170 }
 171
 172 /*
 173  * Take a length and build four lookup tables for applying the zeros operator
 174  * for that length, byte-by-byte on the operand.
 175  */
 176 static void
 177 crc32c_zeros(uint32_t zeros[][256], size_t len)
 178 {
 179         uint32_t op[32];
 180         uint32_t n;
 181
 182         crc32c_zeros_op(op, len);
 183         for (n = 0; n < 256; n++) {
 184                 zeros[0][n] = gf2_matrix_times(op, n);
 185                 zeros[1][n] = gf2_matrix_times(op, n << 8);
 186                 zeros[2][n] = gf2_matrix_times(op, n << 16);
 187                 zeros[3][n] = gf2_matrix_times(op, n << 24);
 188         }
 189 }
 190
 191 /* Apply the zeros operator table to crc. */
 192 static inline uint32_t
 193 crc32c_shift(uint32_t zeros[][256], uint32_t crc)
 194 {
 195
 196         return (zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
 197             zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]);
 198 }
 199
 200 /* Initialize tables for shifting crcs. */
 201 static void
 202 #ifdef USERSPACE_TESTING
 203 __attribute__((__constructor__))
 204 #endif
 205 crc32c_init_hw(void)
 206 {
 207         crc32c_zeros(crc32c_long, LONG);
 208         crc32c_zeros(crc32c_2long, 2 * LONG);
 209         crc32c_zeros(crc32c_short, SHORT);
 210         crc32c_zeros(crc32c_2short, 2 * SHORT);
 211 }
 212 #ifdef _KERNEL
 213 SYSINIT(crc32c_sse42, SI_SUB_LOCK, SI_ORDER_ANY, crc32c_init_hw, NULL);
 214 #endif
 215
 216 /* Compute CRC-32C using the Intel hardware instruction. */
 217 #ifdef USERSPACE_TESTING
 218 uint32_t sse42_crc32c(uint32_t, const unsigned char *, unsigned);
 219 #endif
 220 uint32_t
 221 sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
 222 {
 223 #ifdef __amd64__
 224         const size_t align = 8;
 225 #else
 226         const size_t align = 4;
 227 #endif
 228         const unsigned char *next, *end;
 229 #ifdef __amd64__
 230         uint64_t crc0, crc1, crc2;
 231 #else
 232         uint32_t crc0, crc1, crc2;
 233 #endif
 234
 235         next = buf;
 236         crc0 = crc;
 237
 238         /* Compute the crc to bring the data pointer to an aligned boundary. */
 239         while (len && ((uintptr_t)next & (align - 1)) != 0) {
 240                 crc0 = _mm_crc32_u8(crc0, *next);
 241                 next++;
 242                 len--;
 243         }
 244
 245 #if LONG > SHORT
 246         /*
 247          * Compute the crc on sets of LONG*3 bytes, executing three independent
 248          * crc instructions, each on LONG bytes -- this is optimized for the
 249          * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
 250          * have a throughput of one crc per cycle, but a latency of three
 251          * cycles.
 252          */
 253         crc = 0;
 254         while (len >= LONG * 3) {
 255                 crc1 = 0;
 256                 crc2 = 0;
 257                 end = next + LONG;
 258                 do {
 259 #ifdef __amd64__
 260                         crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
 261                         crc1 = _mm_crc32_u64(crc1,
 262                             *(const uint64_t *)(next + LONG));
 263                         crc2 = _mm_crc32_u64(crc2,
 264                             *(const uint64_t *)(next + (LONG * 2)));
 265 #else
 266                         crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
 267                         crc1 = _mm_crc32_u32(crc1,
 268                             *(const uint32_t *)(next + LONG));
 269                         crc2 = _mm_crc32_u32(crc2,
 270                             *(const uint32_t *)(next + (LONG * 2)));
 271 #endif
 272                         next += align;
 273                 } while (next < end);
 274                 /*-
 275                  * Update the crc.  Try to do it in parallel with the inner
 276                  * loop.  'crc' is used to accumulate crc0 and crc1
 277                  * produced by the inner loop so that the next iteration
 278                  * of the loop doesn't depend on anything except crc2.
 279                  *
 280                  * The full expression for the update is:
 281                  *     crc = S*S*S*crc + S*S*crc0 + S*crc1
 282                  * where the terms are polynomials modulo the CRC polynomial.
 283                  * We regroup this subtly as:
 284                  *     crc = S*S * (S*crc + crc0) + S*crc1.
 285                  * This has an extra dependency which reduces possible
 286                  * parallelism for the expression, but it turns out to be
 287                  * best to intentionally delay evaluation of this expression
 288                  * so that it competes less with the inner loop.
 289                  *
 290                  * We also intentionally reduce parallelism by feedng back
 291                  * crc2 to the inner loop as crc0 instead of accumulating
 292                  * it in crc.  This synchronizes the loop with crc update.
 293                  * CPU and/or compiler schedulers produced bad order without
 294                  * this.
 295                  *
 296                  * Shifts take about 12 cycles each, so 3 here with 2
 297                  * parallelizable take about 24 cycles and the crc update
 298                  * takes slightly longer.  8 dependent crc32 instructions
 299                  * can run in 24 cycles, so the 3-way blocking is worse
 300                  * than useless for sizes less than 8 * <word size> = 64
 301                  * on amd64.  In practice, SHORT = 32 confirms these
 302                  * timing calculations by giving a small improvement
 303                  * starting at size 96.  Then the inner loop takes about
 304                  * 12 cycles and the crc update about 24, but these are
 305                  * partly in parallel so the total time is less than the
 306                  * 36 cycles that 12 dependent crc32 instructions would
 307                  * take.
 308                  *
 309                  * To have a chance of completely hiding the overhead for
 310                  * the crc update, the inner loop must take considerably
 311                  * longer than 24 cycles.  LONG = 64 makes the inner loop
 312                  * take about 24 cycles, so is not quite large enough.
 313                  * LONG = 128 works OK.  Unhideable overheads are about
 314                  * 12 cycles per inner loop.  All assuming timing like
 315                  * Haswell.
 316                  */
 317                 crc = crc32c_shift(crc32c_long, crc) ^ crc0;
 318                 crc1 = crc32c_shift(crc32c_long, crc1);
 319                 crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
 320                 crc0 = crc2;
 321                 next += LONG * 2;
 322                 len -= LONG * 3;
 323         }
 324         crc0 ^= crc;
 325 #endif /* LONG > SHORT */
 326
 327         /*
 328          * Do the same thing, but now on SHORT*3 blocks for the remaining data
 329          * less than a LONG*3 block
 330          */
 331         crc = 0;
 332         while (len >= SHORT * 3) {
 333                 crc1 = 0;
 334                 crc2 = 0;
 335                 end = next + SHORT;
 336                 do {
 337 #ifdef __amd64__
 338                         crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
 339                         crc1 = _mm_crc32_u64(crc1,
 340                             *(const uint64_t *)(next + SHORT));
 341                         crc2 = _mm_crc32_u64(crc2,
 342                             *(const uint64_t *)(next + (SHORT * 2)));
 343 #else
 344                         crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
 345                         crc1 = _mm_crc32_u32(crc1,
 346                             *(const uint32_t *)(next + SHORT));
 347                         crc2 = _mm_crc32_u32(crc2,
 348                             *(const uint32_t *)(next + (SHORT * 2)));
 349 #endif
 350                         next += align;
 351                 } while (next < end);
 352                 crc = crc32c_shift(crc32c_short, crc) ^ crc0;
 353                 crc1 = crc32c_shift(crc32c_short, crc1);
 354                 crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
 355                 crc0 = crc2;
 356                 next += SHORT * 2;
 357                 len -= SHORT * 3;
 358         }
 359         crc0 ^= crc;
 360
 361         /* Compute the crc on the remaining bytes at native word size. */
 362         end = next + (len - (len & (align - 1)));
 363         while (next < end) {
 364 #ifdef __amd64__
 365                 crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
 366 #else
 367                 crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
 368 #endif
 369                 next += align;
 370         }
 371         len &= (align - 1);
 372
 373         /* Compute the crc for any trailing bytes. */
 374         while (len) {
 375                 crc0 = _mm_crc32_u8(crc0, *next);
 376                 next++;
 377                 len--;
 378         }
 379
 380         return ((uint32_t)crc0);
 381 }