sys/contrib/openzfs/module/zcommon/zfs_fletcher.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  25  */
  26 /*
  27  * Copyright 2013 Saso Kiselkov. All rights reserved.
  28  */
  29
  30 /*
  31  * Copyright (c) 2016 by Delphix. All rights reserved.
  32  */
  33
  34 /*
  35  * Fletcher Checksums
  36  * ------------------
  37  *
  38  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
  39  * recurrence relations:
  40  *
  41  *      a  = a    + f
  42  *       i    i-1    i-1
  43  *
  44  *      b  = b    + a
  45  *       i    i-1    i
  46  *
  47  *      c  = c    + b           (fletcher-4 only)
  48  *       i    i-1    i
  49  *
  50  *      d  = d    + c           (fletcher-4 only)
  51  *       i    i-1    i
  52  *
  53  * Where
  54  *      a_0 = b_0 = c_0 = d_0 = 0
  55  * and
  56  *      f_0 .. f_(n-1) are the input data.
  57  *
  58  * Using standard techniques, these translate into the following series:
  59  *
  60  *           __n_                            __n_
  61  *           \   |                           \   |
  62  *      a  =  >     f                   b  =  >     i * f
  63  *       n   /___|   n - i               n   /___|       n - i
  64  *           i = 1                           i = 1
  65  *
  66  *
  67  *           __n_                            __n_
  68  *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
  69  *      c  =  >     ------- f           d  =  >     ------------- f
  70  *       n   /___|     2     n - i       n   /___|        6        n - i
  71  *           i = 1                           i = 1
  72  *
  73  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
  74  * Since the additions are done mod (2^64), errors in the high bits may not
  75  * be noticed.  For this reason, fletcher-2 is deprecated.
  76  *
  77  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
   78  * A conservative bound on how big the buffer can get before we overflow
   79  * can be computed using f_i = 0xffffffff for all i:
  80  *
  81  * % bc
  82  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
  83  * 2264
  84  *  quit
  85  * %
  86  *
  87  * So blocks of up to 2k will not overflow.  Our largest block size is
  88  * 128k, which has 32k 4-byte words, so we can compute the largest possible
  89  * accumulators, then divide by 2^64 to figure the max amount of overflow:
  90  *
  91  * % bc
  92  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
  93  *  a/2^64;b/2^64;c/2^64;d/2^64
  94  * 0
  95  * 0
  96  * 1365
  97  * 11186858
  98  *  quit
  99  * %
 100  *
 101  * So a and b cannot overflow.  To make sure each bit of input has some
 102  * effect on the contents of c and d, we can look at what the factors of
 103  * the coefficients in the equations for c_n and d_n are.  The number of 2s
 104  * in the factors determines the lowest set bit in the multiplier.  Running
 105  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 106  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
  107  * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 108  * even for 128k blocks.
 109  *
 110  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 111  * we could do our calculations mod (2^32 - 1) by adding in the carries
 112  * periodically, and store the number of carries in the top 32-bits.
 113  *
 114  * --------------------
 115  * Checksum Performance
 116  * --------------------
 117  *
 118  * There are two interesting components to checksum performance: cached and
 119  * uncached performance.  With cached data, fletcher-2 is about four times
 120  * faster than fletcher-4.  With uncached data, the performance difference is
 121  * negligible, since the cost of a cache fill dominates the processing time.
 122  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 123  * efficient pass over the data.
 124  *
 125  * In normal operation, the data which is being checksummed is in a buffer
 126  * which has been filled either by:
 127  *
 128  *      1. a compression step, which will be mostly cached, or
 129  *      2. a bcopy() or copyin(), which will be uncached (because the
 130  *         copy is cache-bypassing).
 131  *
 132  * For both cached and uncached data, both fletcher checksums are much faster
 133  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 134  */
 135
 136 #include <sys/types.h>
 137 #include <sys/sysmacros.h>
 138 #include <sys/byteorder.h>
 139 #include <sys/spa.h>
 140 #include <sys/simd.h>
 141 #include <sys/zio_checksum.h>
 142 #include <sys/zfs_context.h>
 143 #include <zfs_fletcher.h>
 144
 145 #define FLETCHER_MIN_SIMD_SIZE  64
 146
 147 static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
 148 static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
 149 static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
 150     const void *buf, uint64_t size);
 151 static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
 152     const void *buf, uint64_t size);
 153 static boolean_t fletcher_4_scalar_valid(void);
 154
 155 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 156         .init_native = fletcher_4_scalar_init,
 157         .fini_native = fletcher_4_scalar_fini,
 158         .compute_native = fletcher_4_scalar_native,
 159         .init_byteswap = fletcher_4_scalar_init,
 160         .fini_byteswap = fletcher_4_scalar_fini,
 161         .compute_byteswap = fletcher_4_scalar_byteswap,
 162         .valid = fletcher_4_scalar_valid,
 163         .name = "scalar"
 164 };
 165
 166 static fletcher_4_ops_t fletcher_4_fastest_impl = {
 167         .name = "fastest",
 168         .valid = fletcher_4_scalar_valid
 169 };
 170
 171 static const fletcher_4_ops_t *fletcher_4_impls[] = {
 172         &fletcher_4_scalar_ops,
 173         &fletcher_4_superscalar_ops,
 174         &fletcher_4_superscalar4_ops,
 175 #if defined(HAVE_SSE2)
 176         &fletcher_4_sse2_ops,
 177 #endif
 178 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 179         &fletcher_4_ssse3_ops,
 180 #endif
 181 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 182         &fletcher_4_avx2_ops,
 183 #endif
 184 #if defined(__x86_64) && defined(HAVE_AVX512F)
 185         &fletcher_4_avx512f_ops,
 186 #endif
 187 #if defined(__x86_64) && defined(HAVE_AVX512BW)
 188         &fletcher_4_avx512bw_ops,
 189 #endif
 190 #if defined(__aarch64__) && !defined(__FreeBSD__)
 191         &fletcher_4_aarch64_neon_ops,
 192 #endif
 193 };
 194
 195 /* Hold all supported implementations */
 196 static uint32_t fletcher_4_supp_impls_cnt = 0;
 197 static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
 198
 199 /* Select fletcher4 implementation */
 200 #define IMPL_FASTEST    (UINT32_MAX)
 201 #define IMPL_CYCLE      (UINT32_MAX - 1)
 202 #define IMPL_SCALAR     (0)
 203
 204 static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
 205
 206 #define IMPL_READ(i)    (*(volatile uint32_t *) &(i))
 207
 208 static struct fletcher_4_impl_selector {
 209         const char      *fis_name;
 210         uint32_t        fis_sel;
 211 } fletcher_4_impl_selectors[] = {
 212         { "cycle",      IMPL_CYCLE },
 213         { "fastest",    IMPL_FASTEST },
 214         { "scalar",     IMPL_SCALAR }
 215 };
 216
 217 #if defined(_KERNEL)
 218 static kstat_t *fletcher_4_kstat;
 219
 220 static struct fletcher_4_kstat {
 221         uint64_t native;
 222         uint64_t byteswap;
 223 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
 224 #endif
 225
 226 /* Indicate that benchmark has been completed */
 227 static boolean_t fletcher_4_initialized = B_FALSE;
 228
 229 void
 230 fletcher_init(zio_cksum_t *zcp)
 231 {
 232         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 233 }
 234
 235 int
 236 fletcher_2_incremental_native(void *buf, size_t size, void *data)
 237 {
 238         zio_cksum_t *zcp = data;
 239
 240         const uint64_t *ip = buf;
 241         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 242         uint64_t a0, b0, a1, b1;
 243
 244         a0 = zcp->zc_word[0];
 245         a1 = zcp->zc_word[1];
 246         b0 = zcp->zc_word[2];
 247         b1 = zcp->zc_word[3];
 248
 249         for (; ip < ipend; ip += 2) {
 250                 a0 += ip[0];
 251                 a1 += ip[1];
 252                 b0 += a0;
 253                 b1 += a1;
 254         }
 255
 256         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 257         return (0);
 258 }
 259
 260 void
 261 fletcher_2_native(const void *buf, uint64_t size,
 262     const void *ctx_template, zio_cksum_t *zcp)
 263 {
 264         (void) ctx_template;
 265         fletcher_init(zcp);
 266         (void) fletcher_2_incremental_native((void *) buf, size, zcp);
 267 }
 268
 269 int
 270 fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
 271 {
 272         zio_cksum_t *zcp = data;
 273
 274         const uint64_t *ip = buf;
 275         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 276         uint64_t a0, b0, a1, b1;
 277
 278         a0 = zcp->zc_word[0];
 279         a1 = zcp->zc_word[1];
 280         b0 = zcp->zc_word[2];
 281         b1 = zcp->zc_word[3];
 282
 283         for (; ip < ipend; ip += 2) {
 284                 a0 += BSWAP_64(ip[0]);
 285                 a1 += BSWAP_64(ip[1]);
 286                 b0 += a0;
 287                 b1 += a1;
 288         }
 289
 290         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 291         return (0);
 292 }
 293
 294 void
 295 fletcher_2_byteswap(const void *buf, uint64_t size,
 296     const void *ctx_template, zio_cksum_t *zcp)
 297 {
 298         (void) ctx_template;
 299         fletcher_init(zcp);
 300         (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
 301 }
 302
 303 static void
 304 fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
 305 {
 306         ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
 307 }
 308
 309 static void
 310 fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 311 {
 312         memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
 313 }
 314
 315 static void
 316 fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
 317     uint64_t size)
 318 {
 319         const uint32_t *ip = buf;
 320         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 321         uint64_t a, b, c, d;
 322
 323         a = ctx->scalar.zc_word[0];
 324         b = ctx->scalar.zc_word[1];
 325         c = ctx->scalar.zc_word[2];
 326         d = ctx->scalar.zc_word[3];
 327
 328         for (; ip < ipend; ip++) {
 329                 a += ip[0];
 330                 b += a;
 331                 c += b;
 332                 d += c;
 333         }
 334
 335         ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
 336 }
 337
 338 static void
 339 fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
 340     uint64_t size)
 341 {
 342         const uint32_t *ip = buf;
 343         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 344         uint64_t a, b, c, d;
 345
 346         a = ctx->scalar.zc_word[0];
 347         b = ctx->scalar.zc_word[1];
 348         c = ctx->scalar.zc_word[2];
 349         d = ctx->scalar.zc_word[3];
 350
 351         for (; ip < ipend; ip++) {
 352                 a += BSWAP_32(ip[0]);
 353                 b += a;
 354                 c += b;
 355                 d += c;
 356         }
 357
 358         ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
 359 }
 360
 361 static boolean_t
 362 fletcher_4_scalar_valid(void)
 363 {
 364         return (B_TRUE);
 365 }
 366
 367 int
 368 fletcher_4_impl_set(const char *val)
 369 {
 370         int err = -EINVAL;
 371         uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 372         size_t i, val_len;
 373
 374         val_len = strlen(val);
 375         while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
 376                 val_len--;
 377
 378         /* check mandatory implementations */
 379         for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
 380                 const char *name = fletcher_4_impl_selectors[i].fis_name;
 381
 382                 if (val_len == strlen(name) &&
 383                     strncmp(val, name, val_len) == 0) {
 384                         impl = fletcher_4_impl_selectors[i].fis_sel;
 385                         err = 0;
 386                         break;
 387                 }
 388         }
 389
 390         if (err != 0 && fletcher_4_initialized) {
 391                 /* check all supported implementations */
 392                 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 393                         const char *name = fletcher_4_supp_impls[i]->name;
 394
 395                         if (val_len == strlen(name) &&
 396                             strncmp(val, name, val_len) == 0) {
 397                                 impl = i;
 398                                 err = 0;
 399                                 break;
 400                         }
 401                 }
 402         }
 403
 404         if (err == 0) {
 405                 atomic_swap_32(&fletcher_4_impl_chosen, impl);
 406                 membar_producer();
 407         }
 408
 409         return (err);
 410 }
 411
 412 /*
 413  * Returns the Fletcher 4 operations for checksums.   When a SIMD
 414  * implementation is not allowed in the current context, then fallback
 415  * to the fastest generic implementation.
 416  */
 417 static inline const fletcher_4_ops_t *
 418 fletcher_4_impl_get(void)
 419 {
 420         if (!kfpu_allowed())
 421                 return (&fletcher_4_superscalar4_ops);
 422
 423         const fletcher_4_ops_t *ops = NULL;
 424         uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 425
 426         switch (impl) {
 427         case IMPL_FASTEST:
 428                 ASSERT(fletcher_4_initialized);
 429                 ops = &fletcher_4_fastest_impl;
 430                 break;
 431         case IMPL_CYCLE:
 432                 /* Cycle through supported implementations */
 433                 ASSERT(fletcher_4_initialized);
 434                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 435                 static uint32_t cycle_count = 0;
 436                 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 437                 ops = fletcher_4_supp_impls[idx];
 438                 break;
 439         default:
 440                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 441                 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
 442                 ops = fletcher_4_supp_impls[impl];
 443                 break;
 444         }
 445
 446         ASSERT3P(ops, !=, NULL);
 447
 448         return (ops);
 449 }
 450
 451 static inline void
 452 fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 453 {
 454         fletcher_4_ctx_t ctx;
 455         const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 456
 457         ops->init_native(&ctx);
 458         ops->compute_native(&ctx, buf, size);
 459         ops->fini_native(&ctx, zcp);
 460 }
 461
 462 void
 463 fletcher_4_native(const void *buf, uint64_t size,
 464     const void *ctx_template, zio_cksum_t *zcp)
 465 {
 466         (void) ctx_template;
 467         const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
 468
 469         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 470
 471         if (size == 0 || p2size == 0) {
 472                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 473
 474                 if (size > 0)
 475                         fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
 476                             buf, size);
 477         } else {
 478                 fletcher_4_native_impl(buf, p2size, zcp);
 479
 480                 if (p2size < size)
 481                         fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
 482                             (char *)buf + p2size, size - p2size);
 483         }
 484 }
 485
 486 void
 487 fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
 488 {
 489         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 490         fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
 491 }
 492
 493 static inline void
 494 fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 495 {
 496         fletcher_4_ctx_t ctx;
 497         const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 498
 499         ops->init_byteswap(&ctx);
 500         ops->compute_byteswap(&ctx, buf, size);
 501         ops->fini_byteswap(&ctx, zcp);
 502 }
 503
 504 void
 505 fletcher_4_byteswap(const void *buf, uint64_t size,
 506     const void *ctx_template, zio_cksum_t *zcp)
 507 {
 508         (void) ctx_template;
 509         const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
 510
 511         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 512
 513         if (size == 0 || p2size == 0) {
 514                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 515
 516                 if (size > 0)
 517                         fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
 518                             buf, size);
 519         } else {
 520                 fletcher_4_byteswap_impl(buf, p2size, zcp);
 521
 522                 if (p2size < size)
 523                         fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
 524                             (char *)buf + p2size, size - p2size);
 525         }
 526 }
 527
 528 /* Incremental Fletcher 4 */
 529
 530 #define ZFS_FLETCHER_4_INC_MAX_SIZE     (8ULL << 20)
 531
 532 static inline void
 533 fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
 534     const zio_cksum_t *nzcp)
 535 {
 536         const uint64_t c1 = size / sizeof (uint32_t);
 537         const uint64_t c2 = c1 * (c1 + 1) / 2;
 538         const uint64_t c3 = c2 * (c1 + 2) / 3;
 539
 540         /*
 541          * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
 542          * reason we split incremental fletcher4 computation of large buffers
 543          * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
 544          */
 545         ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
 546
 547         zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
 548             c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
 549         zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
 550             c2 * zcp->zc_word[0];
 551         zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
 552         zcp->zc_word[0] += nzcp->zc_word[0];
 553 }
 554
 555 static inline void
 556 fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
 557     zio_cksum_t *zcp)
 558 {
 559         while (size > 0) {
 560                 zio_cksum_t nzc;
 561                 uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
 562
 563                 if (native)
 564                         fletcher_4_native(buf, len, NULL, &nzc);
 565                 else
 566                         fletcher_4_byteswap(buf, len, NULL, &nzc);
 567
 568                 fletcher_4_incremental_combine(zcp, len, &nzc);
 569
 570                 size -= len;
 571                 buf += len;
 572         }
 573 }
 574
 575 int
 576 fletcher_4_incremental_native(void *buf, size_t size, void *data)
 577 {
 578         zio_cksum_t *zcp = data;
 579         /* Use scalar impl to directly update cksum of small blocks */
 580         if (size < SPA_MINBLOCKSIZE)
 581                 fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
 582         else
 583                 fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
 584         return (0);
 585 }
 586
 587 int
 588 fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
 589 {
 590         zio_cksum_t *zcp = data;
 591         /* Use scalar impl to directly update cksum of small blocks */
 592         if (size < SPA_MINBLOCKSIZE)
 593                 fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
 594         else
 595                 fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
 596         return (0);
 597 }
 598
 599 #if defined(_KERNEL)
 600 /*
 601  * Fletcher 4 kstats
 602  */
 603 static int
 604 fletcher_4_kstat_headers(char *buf, size_t size)
 605 {
 606         ssize_t off = 0;
 607
 608         off += snprintf(buf + off, size, "%-17s", "implementation");
 609         off += snprintf(buf + off, size - off, "%-15s", "native");
 610         (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
 611
 612         return (0);
 613 }
 614
 615 static int
 616 fletcher_4_kstat_data(char *buf, size_t size, void *data)
 617 {
 618         struct fletcher_4_kstat *fastest_stat =
 619             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 620         struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
 621         ssize_t off = 0;
 622
 623         if (curr_stat == fastest_stat) {
 624                 off += snprintf(buf + off, size - off, "%-17s", "fastest");
 625                 off += snprintf(buf + off, size - off, "%-15s",
 626                     fletcher_4_supp_impls[fastest_stat->native]->name);
 627                 off += snprintf(buf + off, size - off, "%-15s\n",
 628                     fletcher_4_supp_impls[fastest_stat->byteswap]->name);
 629         } else {
 630                 ptrdiff_t id = curr_stat - fletcher_4_stat_data;
 631
 632                 off += snprintf(buf + off, size - off, "%-17s",
 633                     fletcher_4_supp_impls[id]->name);
 634                 off += snprintf(buf + off, size - off, "%-15llu",
 635                     (u_longlong_t)curr_stat->native);
 636                 off += snprintf(buf + off, size - off, "%-15llu\n",
 637                     (u_longlong_t)curr_stat->byteswap);
 638         }
 639
 640         return (0);
 641 }
 642
 643 static void *
 644 fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 645 {
 646         if (n <= fletcher_4_supp_impls_cnt)
 647                 ksp->ks_private = (void *) (fletcher_4_stat_data + n);
 648         else
 649                 ksp->ks_private = NULL;
 650
 651         return (ksp->ks_private);
 652 }
 653 #endif
 654
 655 #define FLETCHER_4_FASTEST_FN_COPY(type, src)                             \
 656 {                                                                         \
 657         fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;       \
 658         fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;       \
 659         fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
 660 }
 661
 662 #define FLETCHER_4_BENCH_NS     (MSEC2NSEC(1))          /* 1ms */
 663
 664 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
 665                                         zio_cksum_t *);
 666
 667 #if defined(_KERNEL)
 668 static void
 669 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 670 {
 671
 672         struct fletcher_4_kstat *fastest_stat =
 673             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 674         hrtime_t start;
 675         uint64_t run_bw, run_time_ns, best_run = 0;
 676         zio_cksum_t zc;
 677         uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
 678
 679         fletcher_checksum_func_t *fletcher_4_test = native ?
 680             fletcher_4_native : fletcher_4_byteswap;
 681
 682         for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 683                 struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
 684                 uint64_t run_count = 0;
 685
 686                 /* temporary set an implementation */
 687                 fletcher_4_impl_chosen = i;
 688
 689                 kpreempt_disable();
 690                 start = gethrtime();
 691                 do {
 692                         for (l = 0; l < 32; l++, run_count++)
 693                                 fletcher_4_test(data, data_size, NULL, &zc);
 694
 695                         run_time_ns = gethrtime() - start;
 696                 } while (run_time_ns < FLETCHER_4_BENCH_NS);
 697                 kpreempt_enable();
 698
 699                 run_bw = data_size * run_count * NANOSEC;
 700                 run_bw /= run_time_ns;  /* B/s */
 701
 702                 if (native)
 703                         stat->native = run_bw;
 704                 else
 705                         stat->byteswap = run_bw;
 706
 707                 if (run_bw > best_run) {
 708                         best_run = run_bw;
 709
 710                         if (native) {
 711                                 fastest_stat->native = i;
 712                                 FLETCHER_4_FASTEST_FN_COPY(native,
 713                                     fletcher_4_supp_impls[i]);
 714                         } else {
 715                                 fastest_stat->byteswap = i;
 716                                 FLETCHER_4_FASTEST_FN_COPY(byteswap,
 717                                     fletcher_4_supp_impls[i]);
 718                         }
 719                 }
 720         }
 721
 722         /* restore original selection */
 723         atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 724 }
 725 #endif /* _KERNEL */
 726
 727 /*
 728  * Initialize and benchmark all supported implementations.
 729  */
 730 static void
 731 fletcher_4_benchmark(void)
 732 {
 733         fletcher_4_ops_t *curr_impl;
 734         int i, c;
 735
 736         /* Move supported implementations into fletcher_4_supp_impls */
 737         for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 738                 curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
 739
 740                 if (curr_impl->valid && curr_impl->valid())
 741                         fletcher_4_supp_impls[c++] = curr_impl;
 742         }
 743         membar_producer();      /* complete fletcher_4_supp_impls[] init */
 744         fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
 745
 746 #if defined(_KERNEL)
 747         static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 748         char *databuf = vmem_alloc(data_size, KM_SLEEP);
 749
 750         for (i = 0; i < data_size / sizeof (uint64_t); i++)
 751                 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 752
 753         fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
 754         fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 755
 756         vmem_free(databuf, data_size);
 757 #else
 758         /*
 759          * Skip the benchmark in user space to avoid impacting libzpool
 760          * consumers (zdb, zhack, zinject, ztest).  The last implementation
 761          * is assumed to be the fastest and used by default.
 762          */
 763         memcpy(&fletcher_4_fastest_impl,
 764             fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
 765             sizeof (fletcher_4_fastest_impl));
 766         fletcher_4_fastest_impl.name = "fastest";
 767         membar_producer();
 768 #endif /* _KERNEL */
 769 }
 770
 771 void
 772 fletcher_4_init(void)
 773 {
 774         /* Determine the fastest available implementation. */
 775         fletcher_4_benchmark();
 776
 777 #if defined(_KERNEL)
 778         /* Install kstats for all implementations */
 779         fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 780             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 781         if (fletcher_4_kstat != NULL) {
 782                 fletcher_4_kstat->ks_data = NULL;
 783                 fletcher_4_kstat->ks_ndata = UINT32_MAX;
 784                 kstat_set_raw_ops(fletcher_4_kstat,
 785                     fletcher_4_kstat_headers,
 786                     fletcher_4_kstat_data,
 787                     fletcher_4_kstat_addr);
 788                 kstat_install(fletcher_4_kstat);
 789         }
 790 #endif
 791
 792         /* Finish initialization */
 793         fletcher_4_initialized = B_TRUE;
 794 }
 795
 796 void
 797 fletcher_4_fini(void)
 798 {
 799 #if defined(_KERNEL)
 800         if (fletcher_4_kstat != NULL) {
 801                 kstat_delete(fletcher_4_kstat);
 802                 fletcher_4_kstat = NULL;
 803         }
 804 #endif
 805 }
 806
 807 /* ABD adapters */
 808
 809 static void
 810 abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
 811 {
 812         const fletcher_4_ops_t *ops = fletcher_4_impl_get();
 813         cdp->acd_private = (void *) ops;
 814
 815         if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
 816                 ops->init_native(cdp->acd_ctx);
 817         else
 818                 ops->init_byteswap(cdp->acd_ctx);
 819 }
 820
 821 static void
 822 abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
 823 {
 824         fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
 825
 826         ASSERT(ops);
 827
 828         if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
 829                 ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
 830         else
 831                 ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
 832 }
 833
 834 static void
 835 abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
 836     zio_abd_checksum_data_t *cdp)
 837 {
 838         zio_cksum_t *zcp = cdp->acd_zcp;
 839
 840         ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
 841
 842         abd_fletcher_4_fini(cdp);
 843         cdp->acd_private = (void *)&fletcher_4_scalar_ops;
 844
 845         if (native)
 846                 fletcher_4_incremental_native(data, size, zcp);
 847         else
 848                 fletcher_4_incremental_byteswap(data, size, zcp);
 849 }
 850
 851 static int
 852 abd_fletcher_4_iter(void *data, size_t size, void *private)
 853 {
 854         zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
 855         fletcher_4_ctx_t *ctx = cdp->acd_ctx;
 856         fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
 857         boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
 858         uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
 859
 860         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 861
 862         if (asize > 0) {
 863                 if (native)
 864                         ops->compute_native(ctx, data, asize);
 865                 else
 866                         ops->compute_byteswap(ctx, data, asize);
 867
 868                 size -= asize;
 869                 data = (char *)data + asize;
 870         }
 871
 872         if (size > 0) {
 873                 ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
 874                 /* At this point we have to switch to scalar impl */
 875                 abd_fletcher_4_simd2scalar(native, data, size, cdp);
 876         }
 877
 878         return (0);
 879 }
 880
 881 zio_abd_checksum_func_t fletcher_4_abd_ops = {
 882         .acf_init = abd_fletcher_4_init,
 883         .acf_fini = abd_fletcher_4_fini,
 884         .acf_iter = abd_fletcher_4_iter
 885 };
 886
 887 #if defined(_KERNEL)
 888
 889 #define IMPL_FMT(impl, i)       (((impl) == (i)) ? "[%s] " : "%s ")
 890
 891 #if defined(__linux__)
 892
 893 static int
 894 fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
 895 {
 896         const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 897         char *fmt;
 898         int cnt = 0;
 899
 900         /* list fastest */
 901         fmt = IMPL_FMT(impl, IMPL_FASTEST);
 902         cnt += sprintf(buffer + cnt, fmt, "fastest");
 903
 904         /* list all supported implementations */
 905         for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
 906                 fmt = IMPL_FMT(impl, i);
 907                 cnt += sprintf(buffer + cnt, fmt,
 908                     fletcher_4_supp_impls[i]->name);
 909         }
 910
 911         return (cnt);
 912 }
 913
 914 static int
 915 fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
 916 {
 917         return (fletcher_4_impl_set(val));
 918 }
 919
 920 #else
 921
 922 #include <sys/sbuf.h>
 923
 924 static int
 925 fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
 926 {
 927         int err;
 928
 929         if (req->newptr == NULL) {
 930                 const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 931                 const int init_buflen = 64;
 932                 const char *fmt;
 933                 struct sbuf *s;
 934
 935                 s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
 936
 937                 /* list fastest */
 938                 fmt = IMPL_FMT(impl, IMPL_FASTEST);
 939                 (void) sbuf_printf(s, fmt, "fastest");
 940
 941                 /* list all supported implementations */
 942                 for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
 943                         fmt = IMPL_FMT(impl, i);
 944                         (void) sbuf_printf(s, fmt,
 945                             fletcher_4_supp_impls[i]->name);
 946                 }
 947
 948                 err = sbuf_finish(s);
 949                 sbuf_delete(s);
 950
 951                 return (err);
 952         }
 953
 954         char buf[16];
 955
 956         err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 957         if (err)
 958                 return (err);
 959         return (-fletcher_4_impl_set(buf));
 960 }
 961
 962 #endif
 963
 964 #undef IMPL_FMT
 965
 966 /*
 967  * Choose a fletcher 4 implementation in ZFS.
 968  * Users can choose "cycle" to exercise all implementations, but this is
 969  * for testing purpose therefore it can only be set in user space.
 970  */
 971 /* BEGIN CSTYLED */
 972 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
 973         fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
 974         "Select fletcher 4 implementation.");
 975 /* END CSTYLED */
 976
 977 EXPORT_SYMBOL(fletcher_init);
 978 EXPORT_SYMBOL(fletcher_2_incremental_native);
 979 EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
 980 EXPORT_SYMBOL(fletcher_4_init);
 981 EXPORT_SYMBOL(fletcher_4_fini);
 982 EXPORT_SYMBOL(fletcher_2_native);
 983 EXPORT_SYMBOL(fletcher_2_byteswap);
 984 EXPORT_SYMBOL(fletcher_4_native);
 985 EXPORT_SYMBOL(fletcher_4_native_varsize);
 986 EXPORT_SYMBOL(fletcher_4_byteswap);
 987 EXPORT_SYMBOL(fletcher_4_incremental_native);
 988 EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
 989 EXPORT_SYMBOL(fletcher_4_abd_ops);
 990 #endif