4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
31 #include <sys/vdev_raidz.h>
32 #include <sys/vdev_raidz_impl.h>
/* Defined outside this file; used below as the is_supported hook of the "original" entry. */
34 extern boolean_t raidz_will_scalar_work(void);
36 /* Opaque implementation with NULL methods to represent original methods */
/* vdev_raidz_math_get_ops() hands this entry out when IMPL_ORIGINAL is selected. */
37 static const raidz_impl_ops_t vdev_raidz_original_impl = {
39 .is_supported = raidz_will_scalar_work,
42 /* RAIDZ parity ops that contain the fastest methods */
/*
 * Not const: benchmark_raidz_impl() stores the winning gen/rec method per
 * slot, and the userspace path of vdev_raidz_math_init() overwrites the whole
 * structure with a copy of the last supported implementation.
 */
43 static raidz_impl_ops_t vdev_raidz_fastest_impl = {
47 /* ABD BRINGUP -- not ready yet */
57 /* All compiled in implementations */
/*
 * vdev_raidz_math_init() walks this table in order and copies every entry
 * whose is_supported() hook succeeds into raidz_supp_impl[].
 * NOTE(review): "original" and "scalar" presumably have to stay first so that
 * IMPL_ORIGINAL (0) and IMPL_SCALAR (1) match their raidz_supp_impl[]
 * positions -- confirm before reordering.
 */
58 const raidz_impl_ops_t *raidz_all_maths[] = {
59 &vdev_raidz_original_impl,
60 &vdev_raidz_scalar_impl,
61 #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
62 &vdev_raidz_sse2_impl,
64 #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
65 &vdev_raidz_ssse3_impl,
67 #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
68 &vdev_raidz_avx2_impl,
70 #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
71 &vdev_raidz_avx512f_impl,
73 #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
74 &vdev_raidz_avx512bw_impl,
76 #if defined(__aarch64__)
77 &vdev_raidz_aarch64_neon_impl,
78 &vdev_raidz_aarch64_neonx2_impl,
82 /* Indicate that vdev_raidz_math_init() has finished (benchmark included, in the kernel) */
83 static boolean_t raidz_math_initialized = B_FALSE;
85 /* Select raidz implementation */
/*
 * The two UINT32_MAX-based values are sentinels. Any selector that is not one
 * of the four named values is used by vdev_raidz_math_get_ops() as an index
 * into raidz_supp_impl[].
 */
86 #define IMPL_FASTEST (UINT32_MAX)
87 #define IMPL_CYCLE (UINT32_MAX - 1)
88 #define IMPL_ORIGINAL (0)
89 #define IMPL_SCALAR (1)
/* Read a selector through a volatile lvalue so each use reloads it from memory. */
91 #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/* Active selector; replaced by user_sel_impl at the end of the kernel init path. */
93 static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Selection requested through vdev_raidz_impl_set() before init has completed. */
94 static uint32_t user_sel_impl = IMPL_FASTEST;
96 /* Hold all supported implementations */
97 static size_t raidz_supp_impl_cnt = 0;
98 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
101 * kstats values for supported implementations
102 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
/*
 * One row per supported implementation plus one extra row, at index
 * raidz_supp_impl_cnt, that records which implementation index was fastest
 * for every gen/rec method.
 */
104 static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
106 /* kstat for benchmarked implementations */
107 static kstat_t *raidz_math_kstat = NULL;
110 * Selects the raidz operation for raidz_map
111 * If rm_ops is set to NULL original raidz implementation will be used
114 vdev_raidz_math_get_ops()
116 raidz_impl_ops_t *ops = NULL;
117 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
121 ASSERT(raidz_math_initialized);
122 ops = &vdev_raidz_fastest_impl;
124 #if !defined(_KERNEL)
127 ASSERT(raidz_math_initialized);
128 ASSERT3U(raidz_supp_impl_cnt, >, 0);
129 /* Cycle through all supported implementations */
130 static size_t cycle_impl_idx = 0;
131 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
132 ops = raidz_supp_impl[idx];
137 ops = (raidz_impl_ops_t *) &vdev_raidz_original_impl;
140 ops = (raidz_impl_ops_t *) &vdev_raidz_scalar_impl;
143 ASSERT3U(impl, <, raidz_supp_impl_cnt);
144 ASSERT3U(raidz_supp_impl_cnt, >, 0);
145 ops = raidz_supp_impl[impl];
149 ASSERT3P(ops, !=, NULL);
/*
 * Looks up the parity generation routine for the map's parity level in
 * rm->rm_ops->gen[] (RAIDZ_GEN_P, RAIDZ_GEN_PQ or RAIDZ_GEN_PQR) and panics
 * on any other parity value. When no routine ends up selected the function
 * returns RAIDZ_ORIGINAL_IMPL.
 * NOTE(review): this listing omits lines inside the body (the case labels and
 * whatever follows the "ABD Bringup" note) -- confirm whether the switch is
 * compiled in at the moment.
 */
155 * Select parity generation method for raidz_map
158 vdev_raidz_math_generate(raidz_map_t *rm)
160 raidz_gen_f gen_parity = NULL;
162 /* ABD Bringup -- vector code not ready */
164 switch (raidz_parity(rm)) {
166 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
169 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
172 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
176 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
182 /* if method is NULL execute the original implementation */
183 if (gen_parity == NULL)
184 return (RAIDZ_ORIGINAL_IMPL);
191 /* ABD Bringup -- vector code not ready */
194 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
197 if (nbaddata == 1 && parity_valid[CODE_P]) {
198 return (rm->rm_ops->rec[RAIDZ_REC_P]);
200 return ((raidz_rec_f) NULL);
204 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
208 if (parity_valid[CODE_P]) {
209 return (rm->rm_ops->rec[RAIDZ_REC_P]);
210 } else if (parity_valid[CODE_Q]) {
211 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
213 } else if (nbaddata == 2 &&
214 parity_valid[CODE_P] && parity_valid[CODE_Q]) {
215 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
217 return ((raidz_rec_f) NULL);
221 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
225 if (parity_valid[CODE_P]) {
226 return (rm->rm_ops->rec[RAIDZ_REC_P]);
227 } else if (parity_valid[CODE_Q]) {
228 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
229 } else if (parity_valid[CODE_R]) {
230 return (rm->rm_ops->rec[RAIDZ_REC_R]);
232 } else if (nbaddata == 2) {
233 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
234 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
235 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
236 return (rm->rm_ops->rec[RAIDZ_REC_PR]);
237 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
238 return (rm->rm_ops->rec[RAIDZ_REC_QR]);
240 } else if (nbaddata == 3 &&
241 parity_valid[CODE_P] && parity_valid[CODE_Q] &&
242 parity_valid[CODE_R]) {
243 return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
245 return ((raidz_rec_f) NULL);
/*
 * Chooses a reconstruction routine through the reconstruct_fun_*_sel()
 * helper that matches the map's parity level, panicking on any other parity
 * value. Returns RAIDZ_ORIGINAL_IMPL when no routine was selected, otherwise
 * the result of running the routine on (rm, dt).
 * NOTE(review): this listing omits lines inside the body (the case labels and
 * whatever follows the "ABD Bringup" note) -- confirm whether the switch is
 * compiled in at the moment.
 */
250 * Select data reconstruction method for raidz_map
251 * @parity_valid - Parity validity flag
252 * @dt - Failed data index array
253 * @nbaddata - Number of failed data columns
256 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
257 const int *dt, const int nbaddata)
259 raidz_rec_f rec_data = NULL;
261 /* ABD Bringup -- vector code not ready */
263 switch (raidz_parity(rm)) {
265 rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
268 rec_data = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
271 rec_data = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
274 cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
280 if (rec_data == NULL)
281 return (RAIDZ_ORIGINAL_IMPL);
283 return (rec_data(rm, dt));
/*
 * Column labels for the benchmark kstat. The kstat formatters iterate over
 * these arrays with ARRAY_SIZE(), so their lengths set the number of gen/rec
 * columns printed per row.
 */
286 const char *raidz_gen_name[] = {
287 "gen_p", "gen_pq", "gen_pqr"
289 const char *raidz_rec_name[] = {
290 "rec_p", "rec_q", "rec_r",
291 "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
294 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
297 raidz_math_kstat_headers(char *buf, size_t size)
302 ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
304 off = snprintf(buf, size, "%-17s", "implementation");
306 for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
307 off += snprintf(buf + off, size - off, "%-16s",
310 for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
311 off += snprintf(buf + off, size - off, "%-16s",
314 (void) snprintf(buf + off, size - off, "\n");
320 raidz_math_kstat_data(char *buf, size_t size, void *data)
322 raidz_impl_kstat_t * fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
323 raidz_impl_kstat_t * cstat = (raidz_impl_kstat_t *) data;
327 ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
329 if (cstat == fstat) {
330 off += snprintf(buf + off, size - off, "%-17s", "fastest");
332 for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) {
333 int id = fstat->gen[i];
334 off += snprintf(buf + off, size - off, "%-16s",
335 raidz_supp_impl[id]->name);
337 for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) {
338 int id = fstat->rec[i];
339 off += snprintf(buf + off, size - off, "%-16s",
340 raidz_supp_impl[id]->name);
343 ptrdiff_t id = cstat - raidz_impl_kstats;
345 off += snprintf(buf + off, size - off, "%-17s",
346 raidz_supp_impl[id]->name);
348 for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
349 off += snprintf(buf + off, size - off, "%-16llu",
350 (u_longlong_t) cstat->gen[i]);
352 for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
353 off += snprintf(buf + off, size - off, "%-16llu",
354 (u_longlong_t) cstat->rec[i]);
357 (void) snprintf(buf + off, size - off, "\n");
363 raidz_math_kstat_addr(kstat_t *ksp, loff_t n)
365 if (n <= raidz_supp_impl_cnt)
366 ksp->ks_private = (void *) (raidz_impl_kstats + n);
368 ksp->ks_private = NULL;
370 return (ksp->ks_private);
/* Benchmark geometry: data columns, total columns with triple parity, zio size, minimum run time. */
373 #define BENCH_D_COLS (8ULL)
374 #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
375 #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
376 #define BENCH_NS MSEC2NSEC(25) /* 25ms */
/* One benchmark step: run method number fn (a gen[] or rec[] index) once on rm. */
378 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
/*
 * benchmark_fn for parity generation: runs vdev_raidz_generate_parity() on
 * rm. benchmark_raidz_impl() also compares against this function's address to
 * tell a generation run from a reconstruction run.
 */
381 benchmark_gen_impl(raidz_map_t *rm, const int fn)
384 vdev_raidz_generate_parity(rm);
388 benchmark_rec_impl(raidz_map_t *rm, const int fn)
390 static const int rec_tgt[7][3] = {
391 {1, 2, 3}, /* rec_p: bad QR & D[0] */
392 {0, 2, 3}, /* rec_q: bad PR & D[0] */
393 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
394 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
395 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
396 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
397 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
400 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
404 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
405 * is performed by setting the rm_ops pointer and calling the top level
406 * generate/reconstruct methods of bench_rm.
409 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
411 uint64_t run_cnt, speed, best_speed = 0;
412 hrtime_t t_start, t_diff;
413 raidz_impl_ops_t *curr_impl;
414 raidz_impl_kstat_t * fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
417 for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
418 /* set an implementation to benchmark */
419 curr_impl = raidz_supp_impl[impl];
420 bench_rm->rm_ops = curr_impl;
423 t_start = gethrtime();
426 for (i = 0; i < 25; i++, run_cnt++)
427 bench_fn(bench_rm, fn);
429 t_diff = gethrtime() - t_start;
430 } while (t_diff < BENCH_NS);
432 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
433 speed /= (t_diff * BENCH_COLS);
435 if (bench_fn == benchmark_gen_impl)
436 raidz_impl_kstats[impl].gen[fn] = speed;
438 raidz_impl_kstats[impl].rec[fn] = speed;
440 /* Update fastest implementation method */
441 if (speed > best_speed) {
444 if (bench_fn == benchmark_gen_impl) {
445 fstat->gen[fn] = impl;
446 vdev_raidz_fastest_impl.gen[fn] =
449 fstat->rec[fn] = impl;
450 vdev_raidz_fastest_impl.rec[fn] =
/*
 * One-time setup of the raidz math layer.
 *
 * 1. Copies every entry of raidz_all_maths[] whose is_supported() hook
 *    succeeds into raidz_supp_impl[] and publishes the count.
 * 2. Userspace build: skips benchmarking, copies the last supported
 *    implementation into vdev_raidz_fastest_impl, marks the layer
 *    initialized and selects "cycle".
 * 3. Kernel build: benchmarks every gen/rec method on a fabricated zio,
 *    installs the "vdev_raidz_bench" kstat, then makes the saved user
 *    selection active and marks the layer initialized.
 * NOTE(review): this listing omits some lines of the body (for example
 * whatever follows the "initialize impl" comment) -- read alongside the full
 * file.
 */
458 vdev_raidz_math_init(void)
460 raidz_impl_ops_t *curr_impl;
461 zio_t *bench_zio = NULL;
462 raidz_map_t *bench_rm = NULL;
463 uint64_t bench_parity;
466 /* move supported impl into raidz_supp_impl */
467 for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
468 curr_impl = (raidz_impl_ops_t *) raidz_all_maths[i];
470 /* initialize impl */
474 if (curr_impl->is_supported())
475 raidz_supp_impl[c++] = (raidz_impl_ops_t *) curr_impl;
477 membar_producer(); /* complete raidz_supp_impl[] init */
478 raidz_supp_impl_cnt = c; /* number of supported impl */
480 #if !defined(_KERNEL)
481 /* Skip benchmarking and use last implementation as fastest */
482 memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
483 sizeof (vdev_raidz_fastest_impl));
/* the memcpy above also copied the source's name; restore the "fastest" label */
484 strcpy(vdev_raidz_fastest_impl.name, "fastest");
/* must be set before vdev_raidz_impl_set() so the selection is applied directly rather than parked in user_sel_impl */
486 raidz_math_initialized = B_TRUE;
488 /* Use 'cycle' math selection method for userspace */
489 VERIFY0(vdev_raidz_impl_set("cycle"));
493 /* Fake a zio and run the benchmark on a warmed up buffer */
494 bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
495 bench_zio->io_offset = 0;
496 bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
497 bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
498 memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
500 /* Benchmark parity generation methods */
501 for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
/* gen method index fn is benchmarked on a map with fn + 1 parity columns */
502 bench_parity = fn + 1;
503 /* New raidz_map is needed for each generate_p/q/r */
504 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
505 BENCH_D_COLS + bench_parity, bench_parity);
507 benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
509 vdev_raidz_map_free(bench_rm);
512 /* Benchmark data reconstruction methods */
/* a single triple-parity map is shared by all reconstruction methods */
513 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
514 BENCH_COLS, PARITY_PQR);
516 for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
517 benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
519 vdev_raidz_map_free(bench_rm);
521 /* cleanup the bench zio */
522 abd_free(bench_zio->io_abd);
523 kmem_free(bench_zio, sizeof (zio_t));
525 /* install kstats for all impl */
526 raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
527 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
/* kstat creation failure is tolerated: the benchmark results are simply not exported */
529 if (raidz_math_kstat != NULL) {
530 raidz_math_kstat->ks_data = NULL;
531 raidz_math_kstat->ks_ndata = UINT32_MAX;
532 kstat_set_raw_ops(raidz_math_kstat,
533 raidz_math_kstat_headers,
534 raidz_math_kstat_data,
535 raidz_math_kstat_addr);
536 kstat_install(raidz_math_kstat);
539 /* Finish initialization */
/* make the selection saved by an early vdev_raidz_impl_set() call the active one */
540 atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
541 raidz_math_initialized = B_TRUE;
/*
 * Teardown counterpart of vdev_raidz_math_init(): deletes the benchmark
 * kstat if one was installed, then walks every compiled-in implementation.
 * NOTE(review): the rest of the loop body is not part of this listing --
 * confirm what is done per implementation.
 */
545 vdev_raidz_math_fini(void)
547 raidz_impl_ops_t const *curr_impl;
550 if (raidz_math_kstat != NULL) {
551 kstat_delete(raidz_math_kstat);
/* clear the pointer so a repeated call does not delete the kstat twice */
552 raidz_math_kstat = NULL;
556 for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
557 curr_impl = raidz_all_maths[i];
/*
 * Names that vdev_raidz_impl_set() always accepts, with the selector value
 * each one maps to. "cycle" exists only in userspace builds.
 * zfs_vdev_raidz_impl_get() skips the final two entries when listing options,
 * so "original" and "scalar" have to remain last.
 */
563 static const struct {
566 } math_impl_opts[] = {
567 #if !defined(_KERNEL)
568 { "cycle", IMPL_CYCLE },
570 { "fastest", IMPL_FASTEST },
571 { "original", IMPL_ORIGINAL },
572 { "scalar", IMPL_SCALAR }
576 * Function sets desired raidz implementation.
578 * If we are called before init(), user preference will be saved in
579 * user_sel_impl, and applied in later init() call. This occurs when module
580 * parameter is specified on module load. Otherwise, directly update
581 * zfs_vdev_raidz_impl.
583 * @val Name of raidz implementation to use
587 vdev_raidz_impl_set(const char *val)
590 char req_name[RAIDZ_IMPL_NAME_MAX];
591 uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
595 i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
596 if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
599 strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
600 while (i > 0 && !!isspace(req_name[i-1]))
604 /* Check mandatory options */
605 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
606 if (strcmp(req_name, math_impl_opts[i].name) == 0) {
607 impl = math_impl_opts[i].sel;
613 /* check all supported impl if init() was already called */
614 if (err != 0 && raidz_math_initialized) {
615 /* check all supported implementations */
616 for (i = 0; i < raidz_supp_impl_cnt; i++) {
617 if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
626 if (raidz_math_initialized)
627 atomic_swap_32(&zfs_vdev_raidz_impl, impl);
629 atomic_swap_32(&user_sel_impl, impl);
635 #if defined(_KERNEL) && defined(HAVE_SPL)
636 #include <linux/mod_compat.h>
/* Module parameter "set" handler: forwards the written string to vdev_raidz_impl_set(); kp is not used. */
639 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
641 return (vdev_raidz_impl_set(val));
645 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
649 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
651 ASSERT(raidz_math_initialized);
653 /* list mandatory options */
654 for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
655 fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
656 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
659 /* list all supported implementations */
660 for (i = 0; i < raidz_supp_impl_cnt; i++) {
661 fmt = (i == impl) ? "[%s] " : "%s ";
662 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
/*
 * Register zfs_vdev_raidz_impl as a module parameter (mode 0644) served by
 * the custom set/get handlers instead of a plain variable.
 */
668 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
669 zfs_vdev_raidz_impl_get, NULL, 0644);
670 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");