sys/contrib/openzfs/man/man5/zfs-module-parameters.5

   1 '\" te
   2 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
   3 .\" Copyright (c) 2019, 2020 by Delphix. All rights reserved.
   4 .\" Copyright (c) 2019 Datto Inc.
   5 .\" The contents of this file are subject to the terms of the Common Development
   6 .\" and Distribution License (the "License").  You may not use this file except
   7 .\" in compliance with the License. You can obtain a copy of the license at
   8 .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
   9 .\"
  10 .\" See the License for the specific language governing permissions and
  11 .\" limitations under the License. When distributing Covered Code, include this
  12 .\" CDDL HEADER in each file and include the License file at
  13 .\" usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this
  14 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
  15 .\" own identifying information:
  16 .\" Portions Copyright [yyyy] [name of copyright owner]
  17 .TH ZFS-MODULE-PARAMETERS 5 "Aug 24, 2020" OpenZFS
  18 .SH NAME
  19 zfs\-module\-parameters \- ZFS module parameters
  20 .SH DESCRIPTION
  21 .sp
  22 .LP
  23 Description of the different parameters to the ZFS module.
  24
  25 .SS "Module parameters"
  26 .sp
  27 .LP
  28
  29 .sp
  30 .ne 2
  31 .na
  32 \fBdbuf_cache_max_bytes\fR (ulong)
  33 .ad
  34 .RS 12n
  35 Maximum size in bytes of the dbuf cache.  The target size is determined by the
  36 MIN versus \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size.  The
  37 behavior of the dbuf cache and its associated settings can be observed via the
  38 \fB/proc/spl/kstat/zfs/dbufstats\fR kstat.
  39 .sp
  40 Default value: \fBULONG_MAX\fR.
  41 .RE
  42
  43 .sp
  44 .ne 2
  45 .na
  46 \fBdbuf_metadata_cache_max_bytes\fR (ulong)
  47 .ad
  48 .RS 12n
  49 Maximum size in bytes of the metadata dbuf cache.  The target size is
  50 determined by the MIN versus \fB1/2^dbuf_metadata_cache_shift\fR (1/64) of the
  51 target ARC size.  The behavior of the metadata dbuf cache and its associated
  52 settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat.
  53 .sp
  54 Default value: \fBULONG_MAX\fR.
  55 .RE
  56
  57 .sp
  58 .ne 2
  59 .na
  60 \fBdbuf_cache_hiwater_pct\fR (uint)
  61 .ad
  62 .RS 12n
  63 The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted
  64 directly.
  65 .sp
  66 Default value: \fB10\fR%.
  67 .RE
  68
  69 .sp
  70 .ne 2
  71 .na
  72 \fBdbuf_cache_lowater_pct\fR (uint)
  73 .ad
  74 .RS 12n
  75 The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops
  76 evicting dbufs.
  77 .sp
  78 Default value: \fB10\fR%.
  79 .RE
  80
  81 .sp
  82 .ne 2
  83 .na
  84 \fBdbuf_cache_shift\fR (int)
  85 .ad
  86 .RS 12n
  87 Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction
  88 of the target ARC size.
  89 .sp
  90 Default value: \fB5\fR.
  91 .RE
  92
  93 .sp
  94 .ne 2
  95 .na
  96 \fBdbuf_metadata_cache_shift\fR (int)
  97 .ad
  98 .RS 12n
  99 Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR,
 100 to a log2 fraction of the target ARC size.
 101 .sp
 102 Default value: \fB6\fR.
 103 .RE
 104
 105 .sp
 106 .ne 2
 107 .na
 108 \fBdmu_object_alloc_chunk_shift\fR (int)
 109 .ad
 110 .RS 12n
 111 dnode slots allocated in a single operation as a power of 2. The default value
 112 minimizes lock contention for the bulk operation performed.
 113 .sp
 114 Default value: \fB7\fR (128).
 115 .RE
 116
 117 .sp
 118 .ne 2
 119 .na
 120 \fBdmu_prefetch_max\fR (int)
 121 .ad
 122 .RS 12n
 123 Limit the amount we can prefetch with one call to this amount (in bytes).
 124 This helps to limit the amount of memory that can be used by prefetching.
 125 .sp
 126 Default value: \fB134,217,728\fR (128MB).
 127 .RE
 128
 129 .sp
 130 .ne 2
 131 .na
 132 \fBignore_hole_birth\fR (int)
 133 .ad
 134 .RS 12n
 135 This is an alias for \fBsend_holes_without_birth_time\fR.
 136 .RE
 137
 138 .sp
 139 .ne 2
 140 .na
 141 \fBl2arc_feed_again\fR (int)
 142 .ad
 143 .RS 12n
 144 Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as
 145 fast as possible.
 146 .sp
 147 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 148 .RE
 149
 150 .sp
 151 .ne 2
 152 .na
 153 \fBl2arc_feed_min_ms\fR (ulong)
 154 .ad
 155 .RS 12n
 156 Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only
 157 applicable in related situations.
 158 .sp
 159 Default value: \fB200\fR.
 160 .RE
 161
 162 .sp
 163 .ne 2
 164 .na
 165 \fBl2arc_feed_secs\fR (ulong)
 166 .ad
 167 .RS 12n
 168 Seconds between L2ARC writing
 169 .sp
 170 Default value: \fB1\fR.
 171 .RE
 172
 173 .sp
 174 .ne 2
 175 .na
 176 \fBl2arc_headroom\fR (ulong)
 177 .ad
 178 .RS 12n
 179 How far through the ARC lists to search for L2ARC cacheable content, expressed
 180 as a multiplier of \fBl2arc_write_max\fR.
 181 ARC persistence across reboots can be achieved with persistent L2ARC by setting
 182 this parameter to \fB0\fR allowing the full length of ARC lists to be searched
 183 for cacheable content.
 184 .sp
 185 Default value: \fB2\fR.
 186 .RE
 187
 188 .sp
 189 .ne 2
 190 .na
 191 \fBl2arc_headroom_boost\fR (ulong)
 192 .ad
 193 .RS 12n
 194 Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being
 195 successfully compressed before writing. A value of \fB100\fR disables this
 196 feature.
 197 .sp
 198 Default value: \fB200\fR%.
 199 .RE
 200
 201 .sp
 202 .ne 2
 203 .na
 204 \fBl2arc_mfuonly\fR (int)
 205 .ad
 206 .RS 12n
 207 Controls whether only MFU metadata and data are cached from ARC into L2ARC.
 208 This may be desired to avoid wasting space on L2ARC when reading/writing large
 209 amounts of data that are not expected to be accessed more than once. The
 210 default is \fB0\fR, meaning both MRU and MFU data and metadata are cached.
 211 When turning off (\fB0\fR) this feature some MRU buffers will still be present
 212 in ARC and eventually cached on L2ARC.
 213 .sp
 214 Use \fB0\fR for no (default) and \fB1\fR for yes.
 215 .RE
 216
 217 .sp
 218 .ne 2
 219 .na
 220 \fBl2arc_meta_percent\fR (int)
 221 .ad
 222 .RS 12n
 223 Percent of ARC size allowed for L2ARC-only headers.
 224 Since L2ARC buffers are not evicted on memory pressure, too large an amount of
 225 headers on a system with an irrationally large L2ARC can render it slow or unusable.
 226 This parameter limits L2ARC writes and rebuild to achieve it.
 227 .sp
 228 Default value: \fB33\fR%.
 229 .RE
 230
 231 .sp
 232 .ne 2
 233 .na
 234 \fBl2arc_trim_ahead\fR (ulong)
 235 .ad
 236 .RS 12n
 237 Trims ahead of the current write size (\fBl2arc_write_max\fR) on L2ARC devices
 238 by this percentage of write size if we have filled the device. If set to
 239 \fB100\fR we TRIM twice the space required to accommodate upcoming writes. A
 240 minimum of 64MB will be trimmed. It also enables TRIM of the whole L2ARC device
 241 upon creation or addition to an existing pool or if the header of the device is
 242 invalid upon importing a pool or onlining a cache device. A value of \fB0\fR
 243 disables TRIM on L2ARC altogether and is the default as it can put significant
 244 stress on the underlying storage devices. This will vary depending on how well
 245 the specific device handles these commands.
 246 .sp
 247 Default value: \fB0\fR%.
 248 .RE
 249
 250 .sp
 251 .ne 2
 252 .na
 253 \fBl2arc_noprefetch\fR (int)
 254 .ad
 255 .RS 12n
 256 Do not write buffers to L2ARC if they were prefetched but not used by
 257 applications.
 258 .sp
 259 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 260 .RE
 261
 262 .sp
 263 .ne 2
 264 .na
 265 \fBl2arc_norw\fR (int)
 266 .ad
 267 .RS 12n
 268 No reads during writes.
 269 .sp
 270 Use \fB1\fR for yes and \fB0\fR for no (default).
 271 .RE
 272
 273 .sp
 274 .ne 2
 275 .na
 276 \fBl2arc_write_boost\fR (ulong)
 277 .ad
 278 .RS 12n
 279 Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount
 280 while they remain cold.
 281 .sp
 282 Default value: \fB8,388,608\fR.
 283 .RE
 284
 285 .sp
 286 .ne 2
 287 .na
 288 \fBl2arc_write_max\fR (ulong)
 289 .ad
 290 .RS 12n
 291 Max write bytes per interval.
 292 .sp
 293 Default value: \fB8,388,608\fR.
 294 .RE
 295
 296 .sp
 297 .ne 2
 298 .na
 299 \fBl2arc_rebuild_enabled\fR (int)
 300 .ad
 301 .RS 12n
 302 Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be
 303 disabled if there are problems importing a pool or attaching an L2ARC device
 304 (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata
 305 has become somehow fragmented/unusable).
 306 .sp
 307 Use \fB1\fR for yes (default) and \fB0\fR for no.
 308 .RE
 309
 310 .sp
 311 .ne 2
 312 .na
 313 \fBl2arc_rebuild_blocks_min_l2size\fR (ulong)
 314 .ad
 315 .RS 12n
 316 Min size (in bytes) of an L2ARC device required in order to write log blocks
 317 in it. The log blocks are used upon importing the pool to rebuild
 318 the L2ARC (persistent L2ARC). Rationale: for L2ARC devices less than 1GB, the
 319 amount of data l2arc_evict() evicts is significant compared to the amount of
 320 restored L2ARC data. In this case do not write log blocks in L2ARC in order not
 321 to waste space.
 322 .sp
 323 Default value: \fB1,073,741,824\fR (1GB).
 324 .RE
 325
 326 .sp
 327 .ne 2
 328 .na
 329 \fBmetaslab_aliquot\fR (ulong)
 330 .ad
 331 .RS 12n
 332 Metaslab granularity, in bytes. This is roughly similar to what would be
 333 referred to as the "stripe size" in traditional RAID arrays. In normal
 334 operation, ZFS will try to write this amount of data to a top-level vdev
 335 before moving on to the next one.
 336 .sp
 337 Default value: \fB524,288\fR.
 338 .RE
 339
 340 .sp
 341 .ne 2
 342 .na
 343 \fBmetaslab_bias_enabled\fR (int)
 344 .ad
 345 .RS 12n
 346 Enable metaslab group biasing based on its vdev's over- or under-utilization
 347 relative to the pool.
 348 .sp
 349 Use \fB1\fR for yes (default) and \fB0\fR for no.
 350 .RE
 351
 352 .sp
 353 .ne 2
 354 .na
 355 \fBmetaslab_force_ganging\fR (ulong)
 356 .ad
 357 .RS 12n
 358 Make some blocks above a certain size be gang blocks.  This option is used
 359 by the test suite to facilitate testing.
 360 .sp
 361 Default value: \fB16,777,217\fR.
 362 .RE
 363
 364 .sp
 365 .ne 2
 366 .na
 367 \fBzfs_keep_log_spacemaps_at_export\fR (int)
 368 .ad
 369 .RS 12n
 370 Prevent log spacemaps from being destroyed during pool exports and destroys.
 371 .sp
 372 Use \fB1\fR for yes and \fB0\fR for no (default).
 373 .RE
 374
 375 .sp
 376 .ne 2
 377 .na
 378 \fBzfs_metaslab_segment_weight_enabled\fR (int)
 379 .ad
 380 .RS 12n
 381 Enable/disable segment-based metaslab selection.
 382 .sp
 383 Use \fB1\fR for yes (default) and \fB0\fR for no.
 384 .RE
 385
 386 .sp
 387 .ne 2
 388 .na
 389 \fBzfs_metaslab_switch_threshold\fR (int)
 390 .ad
 391 .RS 12n
 392 When using segment-based metaslab selection, continue allocating
 393 from the active metaslab until \fBzfs_metaslab_switch_threshold\fR
 394 worth of buckets have been exhausted.
 395 .sp
 396 Default value: \fB2\fR.
 397 .RE
 398
 399 .sp
 400 .ne 2
 401 .na
 402 \fBmetaslab_debug_load\fR (int)
 403 .ad
 404 .RS 12n
 405 Load all metaslabs during pool import.
 406 .sp
 407 Use \fB1\fR for yes and \fB0\fR for no (default).
 408 .RE
 409
 410 .sp
 411 .ne 2
 412 .na
 413 \fBmetaslab_debug_unload\fR (int)
 414 .ad
 415 .RS 12n
 416 Prevent metaslabs from being unloaded.
 417 .sp
 418 Use \fB1\fR for yes and \fB0\fR for no (default).
 419 .RE
 420
 421 .sp
 422 .ne 2
 423 .na
 424 \fBmetaslab_fragmentation_factor_enabled\fR (int)
 425 .ad
 426 .RS 12n
 427 Enable use of the fragmentation metric in computing metaslab weights.
 428 .sp
 429 Use \fB1\fR for yes (default) and \fB0\fR for no.
 430 .RE
 431
 432 .sp
 433 .ne 2
 434 .na
 435 \fBmetaslab_df_max_search\fR (int)
 436 .ad
 437 .RS 12n
 438 Maximum distance to search forward from the last offset. Without this limit,
 439 fragmented pools can see >100,000 iterations and metaslab_block_picker()
 440 becomes the performance limiting factor on high-performance storage.
 441
 442 With the default setting of 16MB, we typically see less than 500 iterations,
 443 even with very fragmented, ashift=9 pools. The maximum number of iterations
 444 possible is: \fBmetaslab_df_max_search / (2 * (1<<ashift))\fR.
 445 With the default setting of 16MB this is 16*1024 (with ashift=9) or 2048
 446 (with ashift=12).
 447 .sp
 448 Default value: \fB16,777,216\fR (16MB)
 449 .RE
 450
 451 .sp
 452 .ne 2
 453 .na
 454 \fBmetaslab_df_use_largest_segment\fR (int)
 455 .ad
 456 .RS 12n
 457 If we are not searching forward (due to metaslab_df_max_search,
 458 metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable controls
 459 what segment is used.  If it is set, we will use the largest free segment.
 460 If it is not set, we will use a segment of exactly the requested size (or
 461 larger).
 462 .sp
 463 Use \fB1\fR for yes and \fB0\fR for no (default).
 464 .RE
 465
 466 .sp
 467 .ne 2
 468 .na
 469 \fBzfs_metaslab_max_size_cache_sec\fR (ulong)
 470 .ad
 471 .RS 12n
 472 When we unload a metaslab, we cache the size of the largest free chunk. We use
 473 that cached size to determine whether or not to load a metaslab for a given
 474 allocation. As more frees accumulate in that metaslab while it's unloaded, the
 475 cached max size becomes less and less accurate. After a number of seconds
 476 controlled by this tunable, we stop considering the cached max size and start
 477 considering only the histogram instead.
 478 .sp
 479 Default value: \fB3600 seconds\fR (one hour)
 480 .RE
 481
 482 .sp
 483 .ne 2
 484 .na
 485 \fBzfs_metaslab_mem_limit\fR (int)
 486 .ad
 487 .RS 12n
 488 When we are loading a new metaslab, we check the amount of memory being used
 489 to store metaslab range trees. If it is over a threshold, we attempt to unload
 490 the least recently used metaslab to prevent the system from clogging all of
 491 its memory with range trees. This tunable sets the percentage of total system
 492 memory that is the threshold.
 493 .sp
 494 Default value: \fB25 percent\fR
 495 .RE
 496
 497 .sp
 498 .ne 2
 499 .na
 500 \fBzfs_vdev_default_ms_count\fR (int)
 501 .ad
 502 .RS 12n
 503 When a vdev is added, target this number of metaslabs per top-level vdev.
 504 .sp
 505 Default value: \fB200\fR.
 506 .RE
 507
 508 .sp
 509 .ne 2
 510 .na
 511 \fBzfs_vdev_default_ms_shift\fR (int)
 512 .ad
 513 .RS 12n
 514 Default limit for metaslab size.
 515 .sp
 516 Default value: \fB29\fR [meaning (1 << 29) = 512MB].
 517 .RE
 518
 519 .sp
 520 .ne 2
 521 .na
 522 \fBzfs_vdev_max_auto_ashift\fR (ulong)
 523 .ad
 524 .RS 12n
 525 Maximum ashift used when optimizing for logical -> physical sector size on new
 526 top-level vdevs.
 527 .sp
 528 Default value: \fBASHIFT_MAX\fR (16).
 529 .RE
 530
 531 .sp
 532 .ne 2
 533 .na
 534 \fBzfs_vdev_min_auto_ashift\fR (ulong)
 535 .ad
 536 .RS 12n
 537 Minimum ashift used when creating new top-level vdevs.
 538 .sp
 539 Default value: \fBASHIFT_MIN\fR (9).
 540 .RE
 541
 542 .sp
 543 .ne 2
 544 .na
 545 \fBzfs_vdev_min_ms_count\fR (int)
 546 .ad
 547 .RS 12n
 548 Minimum number of metaslabs to create in a top-level vdev.
 549 .sp
 550 Default value: \fB16\fR.
 551 .RE
 552
 553 .sp
 554 .ne 2
 555 .na
 556 \fBvdev_validate_skip\fR (int)
 557 .ad
 558 .RS 12n
 559 Skip label validation steps during pool import. Changing is not recommended
 560 unless you know what you are doing and are recovering a damaged label.
 561 .sp
 562 Default value: \fB0\fR.
 563 .RE
 564
 565 .sp
 566 .ne 2
 567 .na
 568 \fBzfs_vdev_ms_count_limit\fR (int)
 569 .ad
 570 .RS 12n
 571 Practical upper limit of total metaslabs per top-level vdev.
 572 .sp
 573 Default value: \fB131,072\fR.
 574 .RE
 575
 576 .sp
 577 .ne 2
 578 .na
 579 \fBmetaslab_preload_enabled\fR (int)
 580 .ad
 581 .RS 12n
 582 Enable metaslab group preloading.
 583 .sp
 584 Use \fB1\fR for yes (default) and \fB0\fR for no.
 585 .RE
 586
 587 .sp
 588 .ne 2
 589 .na
 590 \fBmetaslab_lba_weighting_enabled\fR (int)
 591 .ad
 592 .RS 12n
 593 Give more weight to metaslabs with lower LBAs, assuming they have
 594 greater bandwidth as is typically the case on a modern constant
 595 angular velocity disk drive.
 596 .sp
 597 Use \fB1\fR for yes (default) and \fB0\fR for no.
 598 .RE
 599
 600 .sp
 601 .ne 2
 602 .na
 603 \fBmetaslab_unload_delay\fR (int)
 604 .ad
 605 .RS 12n
 606 After a metaslab is used, we keep it loaded for this many txgs, to attempt to
 607 reduce unnecessary reloading. Note that both this many txgs and
 608 \fBmetaslab_unload_delay_ms\fR milliseconds must pass before unloading will
 609 occur.
 610 .sp
 611 Default value: \fB32\fR.
 612 .RE
 613
 614 .sp
 615 .ne 2
 616 .na
 617 \fBmetaslab_unload_delay_ms\fR (int)
 618 .ad
 619 .RS 12n
 620 After a metaslab is used, we keep it loaded for this many milliseconds, to
 621 attempt to reduce unnecessary reloading. Note that both this many
 622 milliseconds and \fBmetaslab_unload_delay\fR txgs must pass before unloading
 623 will occur.
 624 .sp
 625 Default value: \fB600000\fR (ten minutes).
 626 .RE
 627
 628 .sp
 629 .ne 2
 630 .na
 631 \fBsend_holes_without_birth_time\fR (int)
 632 .ad
 633 .RS 12n
 634 When set, the hole_birth optimization will not be used, and all holes will
 635 always be sent on zfs send.  This is useful if you suspect your datasets are
 636 affected by a bug in hole_birth.
 637 .sp
 638 Use \fB1\fR for on (default) and \fB0\fR for off.
 639 .RE
 640
 641 .sp
 642 .ne 2
 643 .na
 644 \fBspa_config_path\fR (charp)
 645 .ad
 646 .RS 12n
 647 SPA config file
 648 .sp
 649 Default value: \fB/etc/zfs/zpool.cache\fR.
 650 .RE
 651
 652 .sp
 653 .ne 2
 654 .na
 655 \fBspa_asize_inflation\fR (int)
 656 .ad
 657 .RS 12n
 658 Multiplication factor used to estimate actual disk consumption from the
 659 size of data being written. The default value is a worst case estimate,
 660 but lower values may be valid for a given pool depending on its
 661 configuration.  Pool administrators who understand the factors involved
 662 may wish to specify a more realistic inflation factor, particularly if
 663 they operate close to quota or capacity limits.
 664 .sp
 665 Default value: \fB24\fR.
 666 .RE
 667
 668 .sp
 669 .ne 2
 670 .na
 671 \fBspa_load_print_vdev_tree\fR (int)
 672 .ad
 673 .RS 12n
 674 Whether to print the vdev tree in the debugging message buffer during pool import.
 675 Use 0 to disable and 1 to enable.
 676 .sp
 677 Default value: \fB0\fR.
 678 .RE
 679
 680 .sp
 681 .ne 2
 682 .na
 683 \fBspa_load_verify_data\fR (int)
 684 .ad
 685 .RS 12n
 686 Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR)
 687 import.  Use 0 to disable and 1 to enable.
 688
 689 An extreme rewind import normally performs a full traversal of all
 690 blocks in the pool for verification.  If this parameter is set to 0,
 691 the traversal skips non-metadata blocks.  It can be toggled once the
 692 import has started to stop or start the traversal of non-metadata blocks.
 693 .sp
 694 Default value: \fB1\fR.
 695 .RE
 696
 697 .sp
 698 .ne 2
 699 .na
 700 \fBspa_load_verify_metadata\fR (int)
 701 .ad
 702 .RS 12n
 703 Whether to traverse blocks during an "extreme rewind" (\fB-X\fR)
 704 pool import.  Use 0 to disable and 1 to enable.
 705
 706 An extreme rewind import normally performs a full traversal of all
 707 blocks in the pool for verification.  If this parameter is set to 0,
 708 the traversal is not performed.  It can be toggled once the import has
 709 started to stop or start the traversal.
 710 .sp
 711 Default value: \fB1\fR.
 712 .RE
 713
 714 .sp
 715 .ne 2
 716 .na
 717 \fBspa_load_verify_shift\fR (int)
 718 .ad
 719 .RS 12n
 720 Sets the maximum number of bytes to consume during pool import to the log2
 721 fraction of the target ARC size.
 722 .sp
 723 Default value: \fB4\fR.
 724 .RE
 725
 726 .sp
 727 .ne 2
 728 .na
 729 \fBspa_slop_shift\fR (int)
 730 .ad
 731 .RS 12n
 732 Normally, we don't allow the last 3.125% (1/(2^spa_slop_shift)) of space
 733 in the pool to be consumed.  This ensures that we don't run the pool
 734 completely out of space, due to unaccounted changes (e.g. to the MOS).
 735 It also limits the worst-case time to allocate space.  If we have
 736 less than this amount of free space, most ZPL operations (e.g. write,
 737 create) will return ENOSPC.
 738 .sp
 739 Default value: \fB5\fR.
 740 .RE
 741
 742 .sp
 743 .ne 2
 744 .na
 745 \fBvdev_removal_max_span\fR (int)
 746 .ad
 747 .RS 12n
 748 During top-level vdev removal, chunks of data are copied from the vdev
 749 which may include free space in order to trade bandwidth for IOPS.
 750 This parameter determines the maximum span of free space (in bytes)
 751 which will be included as "unnecessary" data in a chunk of copied data.
 752
 753 The default value here was chosen to align with
 754 \fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing
 755 regular reads (but there's no reason it has to be the same).
 756 .sp
 757 Default value: \fB32,768\fR.
 758 .RE
 759
 760 .sp
 761 .ne 2
 762 .na
 763 \fBvdev_file_logical_ashift\fR (ulong)
 764 .ad
 765 .RS 12n
 766 Logical ashift for file-based devices.
 767 .sp
 768 Default value: \fB9\fR.
 769 .RE
 770
 771 .sp
 772 .ne 2
 773 .na
 774 \fBvdev_file_physical_ashift\fR (ulong)
 775 .ad
 776 .RS 12n
 777 Physical ashift for file-based devices.
 778 .sp
 779 Default value: \fB9\fR.
 780 .RE
 781
 782 .sp
 783 .ne 2
 784 .na
 785 \fBzap_iterate_prefetch\fR (int)
 786 .ad
 787 .RS 12n
 788 If this is set, when we start iterating over a ZAP object, zfs will prefetch
 789 the entire object (all leaf blocks).  However, this is limited by
 790 \fBdmu_prefetch_max\fR.
 791 .sp
 792 Use \fB1\fR for on (default) and \fB0\fR for off.
 793 .RE
 794
 795 .sp
 796 .ne 2
 797 .na
 798 \fBzfetch_array_rd_sz\fR (ulong)
 799 .ad
 800 .RS 12n
 801 If prefetching is enabled, disable prefetching for reads larger than this size.
 802 .sp
 803 Default value: \fB1,048,576\fR.
 804 .RE
 805
 806 .sp
 807 .ne 2
 808 .na
 809 \fBzfetch_max_distance\fR (uint)
 810 .ad
 811 .RS 12n
 812 Max bytes to prefetch per stream.
 813 .sp
 814 Default value: \fB8,388,608\fR (8MB).
 815 .RE
 816
 817 .sp
 818 .ne 2
 819 .na
 820 \fBzfetch_max_idistance\fR (uint)
 821 .ad
 822 .RS 12n
 823 Max bytes to prefetch indirects for per stream.
 824 .sp
 825 Default value: \fB67,108,864\fR (64MB).
 826 .RE
 827
 828 .sp
 829 .ne 2
 830 .na
 831 \fBzfetch_max_streams\fR (uint)
 832 .ad
 833 .RS 12n
 834 Max number of streams per zfetch (prefetch streams per file).
 835 .sp
 836 Default value: \fB8\fR.
 837 .RE
 838
 839 .sp
 840 .ne 2
 841 .na
 842 \fBzfetch_min_sec_reap\fR (uint)
 843 .ad
 844 .RS 12n
 845 Min time in seconds before an active prefetch stream can be reclaimed.
 846 .sp
 847 Default value: \fB2\fR.
 848 .RE
 849
 850 .sp
 851 .ne 2
 852 .na
 853 \fBzfs_abd_scatter_enabled\fR (int)
 854 .ad
 855 .RS 12n
 856 Enables the ARC to use scatter/gather lists. When disabled, all allocations are
 857 forced to be linear in kernel memory. Disabling can improve performance in some
 858 code paths at the expense of fragmented kernel memory.
 859 .sp
 860 Default value: \fB1\fR.
 861 .RE
 862
 863 .sp
 864 .ne 2
 865 .na
 866 \fBzfs_abd_scatter_max_order\fR (uint)
 867 .ad
 868 .RS 12n
 869 Maximum number of consecutive memory pages allocated in a single block for
 870 scatter/gather lists. Default value is specified by the kernel itself.
 871 .sp
 872 Default value: \fB10\fR at the time of this writing.
 873 .RE
 874
 875 .sp
 876 .ne 2
 877 .na
 878 \fBzfs_abd_scatter_min_size\fR (uint)
 879 .ad
 880 .RS 12n
 881 This is the minimum allocation size that will use scatter (page-based)
 882 ABD's.  Smaller allocations will use linear ABD's.
 883 .sp
 884 Default value: \fB1536\fR (512B and 1KB allocations will be linear).
 885 .RE
 886
 887 .sp
 888 .ne 2
 889 .na
 890 \fBzfs_arc_dnode_limit\fR (ulong)
 891 .ad
 892 .RS 12n
 893 When the number of bytes consumed by dnodes in the ARC exceeds this number of
 894 bytes, try to unpin some of it in response to demand for non-metadata. This
 895 value acts as a ceiling to the amount of dnode metadata, and defaults to 0 which
 896 indicates that a percent which is based on \fBzfs_arc_dnode_limit_percent\fR of
 897 the ARC meta buffers may be used for dnodes.
 898
 899 See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used
 900 when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather
 901 than in response to overall demand for non-metadata.
 902
 903 .sp
 904 Default value: \fB0\fR.
 905 .RE
 906
 907 .sp
 908 .ne 2
 909 .na
 910 \fBzfs_arc_dnode_limit_percent\fR (ulong)
 911 .ad
 912 .RS 12n
 913 Percentage that can be consumed by dnodes of ARC meta buffers.
 914 .sp
 915 See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a
 916 higher priority if set to nonzero value.
 917 .sp
 918 Default value: \fB10\fR%.
 919 .RE
 920
 921 .sp
 922 .ne 2
 923 .na
 924 \fBzfs_arc_dnode_reduce_percent\fR (ulong)
 925 .ad
 926 .RS 12n
 927 Percentage of ARC dnodes to try to scan in response to demand for non-metadata
 928 when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR.
 929
 930 .sp
 931 Default value: \fB10\fR% of the number of dnodes in the ARC.
 932 .RE
 933
 934 .sp
 935 .ne 2
 936 .na
 937 \fBzfs_arc_average_blocksize\fR (int)
 938 .ad
 939 .RS 12n
 940 The ARC's buffer hash table is sized based on the assumption of an average
 941 block size of \fBzfs_arc_average_blocksize\fR (default 8K).  This works out
 942 to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers.
 943 For configurations with a known larger average block size this value can be
 944 increased to reduce the memory footprint.
 945
 946 .sp
 947 Default value: \fB8192\fR.
 948 .RE
 949
 950 .sp
 951 .ne 2
 952 .na
 953 \fBzfs_arc_eviction_pct\fR (int)
 954 .ad
 955 .RS 12n
 956 When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this
 957 percent of the requested amount of data to be evicted.  For example, by
 958 default for every 2KB that's evicted, 1KB of it may be "reused" by a new
 959 allocation. Since this is above 100%, it ensures that progress is made
 960 towards getting \fBarc_size\fR under \fBarc_c\fR.  Since this is finite, it
 961 ensures that allocations can still happen, even during the potentially long
 962 time that \fBarc_size\fR is more than \fBarc_c\fR.
 963 .sp
 964 Default value: \fB200\fR.
 965 .RE
 966
 967 .sp
 968 .ne 2
 969 .na
 970 \fBzfs_arc_evict_batch_limit\fR (int)
 971 .ad
 972 .RS 12n
 973 Number of ARC headers to evict per sub-list before proceeding to another sub-list.
 974 This batch-style operation prevents entire sub-lists from being evicted at once
 975 but comes at a cost of additional unlocking and locking.
 976 .sp
 977 Default value: \fB10\fR.
 978 .RE
 979
 980 .sp
 981 .ne 2
 982 .na
 983 \fBzfs_arc_grow_retry\fR (int)
 984 .ad
 985 .RS 12n
 986 If set to a non zero value, it will replace the arc_grow_retry value with this value.
 987 The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before
 988 trying to resume growth after a memory pressure event.
 989 .sp
 990 Default value: \fB0\fR.
 991 .RE
 992
 993 .sp
 994 .ne 2
 995 .na
 996 \fBzfs_arc_lotsfree_percent\fR (int)
 997 .ad
 998 .RS 12n
 999 Throttle I/O when free system memory drops below this percentage of total
1000 system memory.  Setting this value to 0 will disable the throttle.
1001 .sp
1002 Default value: \fB10\fR%.
1003 .RE
1004
1005 .sp
1006 .ne 2
1007 .na
1008 \fBzfs_arc_max\fR (ulong)
1009 .ad
1010 .RS 12n
1011 Max size of ARC in bytes.  If set to 0 then the max size of ARC is determined
1012 by the amount of system memory installed.  For Linux, 1/2 of system memory will
1013 be used as the limit.  For FreeBSD, the larger of (all system memory minus 1GB) or
1014 5/8 of system memory will be used as the limit.  This value must be at least
1015 67108864 (64 megabytes).
1016 .sp
1017 This value can be changed dynamically with some caveats. It cannot be set back
1018 to 0 while running and reducing it below the current ARC size will not cause
1019 the ARC to shrink without memory pressure to induce shrinking.
1020 .sp
1021 Default value: \fB0\fR.
1022 .RE
1023
1024 .sp
1025 .ne 2
1026 .na
1027 \fBzfs_arc_meta_adjust_restarts\fR (ulong)
1028 .ad
1029 .RS 12n
1030 The number of restart passes to make while scanning the ARC attempting
1031 to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
1032 This value should not need to be tuned but is available to facilitate
1033 performance analysis.
1034 .sp
1035 Default value: \fB4096\fR.
1036 .RE
1037
1038 .sp
1039 .ne 2
1040 .na
1041 \fBzfs_arc_meta_limit\fR (ulong)
1042 .ad
1043 .RS 12n
1044 The maximum allowed size in bytes that meta data buffers are allowed to
1045 consume in the ARC.  When this limit is reached meta data buffers will
1046 be reclaimed even if the overall arc_c_max has not been reached.  This
1047 value defaults to 0 which indicates that a percent which is based on
1048 \fBzfs_arc_meta_limit_percent\fR of the ARC may be used for meta data.
1049 .sp
1050 This value may be changed dynamically except that it cannot be set back to 0
1051 for a specific percent of the ARC; it must be set to an explicit value.
1052 .sp
1053 Default value: \fB0\fR.
1054 .RE
1055
1056 .sp
1057 .ne 2
1058 .na
1059 \fBzfs_arc_meta_limit_percent\fR (ulong)
1060 .ad
1061 .RS 12n
1062 Percentage of ARC buffers that can be used for meta data.
1063
1064 See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a
1065 higher priority if set to nonzero value.
1066
1067 .sp
1068 Default value: \fB75\fR%.
1069 .RE
1070
1071 .sp
1072 .ne 2
1073 .na
1074 \fBzfs_arc_meta_min\fR (ulong)
1075 .ad
1076 .RS 12n
1077 The minimum allowed size in bytes that meta data buffers may consume in
1078 the ARC.  This value defaults to 0 which disables a floor on the amount
1079 of the ARC devoted to meta data.
1080 .sp
1081 Default value: \fB0\fR.
1082 .RE
1083
1084 .sp
1085 .ne 2
1086 .na
1087 \fBzfs_arc_meta_prune\fR (int)
1088 .ad
1089 .RS 12n
1090 The number of dentries and inodes to be scanned looking for entries
1091 which can be dropped.  This may be required when the ARC reaches the
1092 \fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
1093 in the ARC.  Increasing this value will cause the dentry and inode caches
1094 to be pruned more aggressively.  Setting this value to 0 will disable
1095 pruning the inode and dentry caches.
1096 .sp
1097 Default value: \fB10,000\fR.
1098 .RE
1099
1100 .sp
1101 .ne 2
1102 .na
1103 \fBzfs_arc_meta_strategy\fR (int)
1104 .ad
1105 .RS 12n
1106 Define the strategy for ARC meta data buffer eviction (meta reclaim strategy).
1107 A value of 0 (META_ONLY) will evict only the ARC meta data buffers.
1108 A value of 1 (BALANCED) indicates that additional data buffers may be evicted if
1109 that is required in order to evict the required number of meta data buffers.
1110 .sp
1111 Default value: \fB1\fR.
1112 .RE
1113
1114 .sp
1115 .ne 2
1116 .na
1117 \fBzfs_arc_min\fR (ulong)
1118 .ad
1119 .RS 12n
1120 Min size of ARC in bytes. If set to 0 then arc_c_min will default to
1121 consuming the larger of 32M or 1/32 of total system memory.
1122 .sp
1123 Default value: \fB0\fR.
1124 .RE
1125
1126 .sp
1127 .ne 2
1128 .na
1129 \fBzfs_arc_min_prefetch_ms\fR (int)
1130 .ad
1131 .RS 12n
1132 Minimum time prefetched blocks are locked in the ARC, specified in ms.
1133 A value of \fB0\fR will default to 1000 ms.
1134 .sp
1135 Default value: \fB0\fR.
1136 .RE
1137
1138 .sp
1139 .ne 2
1140 .na
1141 \fBzfs_arc_min_prescient_prefetch_ms\fR (int)
1142 .ad
1143 .RS 12n
1144 Minimum time "prescient prefetched" blocks are locked in the ARC, specified
1145 in ms. These blocks are meant to be prefetched fairly aggressively ahead of
1146 the code that may use them. A value of \fB0\fR will default to 6000 ms.
1147 .sp
1148 Default value: \fB0\fR.
1149 .RE
1150
1151 .sp
1152 .ne 2
1153 .na
1154 \fBzfs_max_missing_tvds\fR (int)
1155 .ad
1156 .RS 12n
1157 Number of missing top-level vdevs which will be allowed during
1158 pool import (only in read-only mode).
1159 .sp
1160 Default value: \fB0\fR
1161 .RE
1162
1163 .sp
1164 .ne 2
1165 .na
1166 \fBzfs_max_nvlist_src_size\fR (ulong)
1167 .ad
1168 .RS 12n
1169 Maximum size in bytes allowed to be passed as zc_nvlist_src_size for ioctls on
1170 /dev/zfs. This prevents a user from causing the kernel to allocate an excessive
1171 amount of memory. When the limit is exceeded, the ioctl fails with EINVAL and a
1172 description of the error is sent to the zfs-dbgmsg log. This parameter should
1173 not need to be touched under normal circumstances. On FreeBSD, the default is
1174 based on the system limit on user wired memory. On Linux, the default is
1175 \fBKMALLOC_MAX_SIZE\fR.
1176 .sp
1177 Default value: \fB0\fR (kernel decides)
1178 .RE
1179
1180 .sp
1181 .ne 2
1182 .na
1183 \fBzfs_multilist_num_sublists\fR (int)
1184 .ad
1185 .RS 12n
1186 To allow more fine-grained locking, each ARC state contains a series
1187 of lists for both data and meta data objects.  Locking is performed at
1188 the level of these "sub-lists".  This parameter controls the number of
1189 sub-lists per ARC state, and also applies to other uses of the
1190 multilist data structure.
1191 .sp
1192 Default value: \fB4\fR or the number of online CPUs, whichever is greater
1193 .RE
1194
1195 .sp
1196 .ne 2
1197 .na
1198 \fBzfs_arc_overflow_shift\fR (int)
1199 .ad
1200 .RS 12n
1201 The ARC size is considered to be overflowing if it exceeds the current
1202 ARC target size (arc_c) by a threshold determined by this parameter.
1203 The threshold is calculated as a fraction of arc_c using the formula
1204 "arc_c >> \fBzfs_arc_overflow_shift\fR".
1205
1206 The default value of 8 causes the ARC to be considered to be overflowing
1207 if it exceeds the target size by 1/256th (0.3%) of the target size.
1208
1209 When the ARC is overflowing, new buffer allocations are stalled until
1210 the reclaim thread catches up and the overflow condition no longer exists.
1211 .sp
1212 Default value: \fB8\fR.
1213 .RE
1214
1215 .sp
1216 .ne 2
1217 .na
1218
1219 \fBzfs_arc_p_min_shift\fR (int)
1220 .ad
1221 .RS 12n
1222 If set to a non zero value, this will update arc_p_min_shift (default 4)
1223 with the new value.
1224 arc_p_min_shift is used as the shift of arc_c for calculating both the min
1225 and max arc_p.
1226 .sp
1227 Default value: \fB0\fR.
1228 .RE
1229
1230 .sp
1231 .ne 2
1232 .na
1233 \fBzfs_arc_p_dampener_disable\fR (int)
1234 .ad
1235 .RS 12n
1236 Disable arc_p adapt dampener
1237 .sp
1238 Use \fB1\fR for yes (default) and \fB0\fR for no.
1239 .RE
1240
1241 .sp
1242 .ne 2
1243 .na
1244 \fBzfs_arc_shrink_shift\fR (int)
1245 .ad
1246 .RS 12n
1247 If set to a non zero value, this will update arc_shrink_shift (default 7)
1248 with the new value.
1249 .sp
1250 Default value: \fB0\fR.
1251 .RE
1252
1253 .sp
1254 .ne 2
1255 .na
1256 \fBzfs_arc_pc_percent\fR (uint)
1257 .ad
1258 .RS 12n
1259 Percent of pagecache to reclaim arc to
1260
1261 This tunable allows ZFS arc to play more nicely with the kernel's LRU
1262 pagecache. It can guarantee that the ARC size won't collapse under scanning
1263 pressure on the pagecache, yet still allows arc to be reclaimed down to
1264 zfs_arc_min if necessary. This value is specified as percent of pagecache
1265 size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This
1266 only operates during memory pressure/reclaim.
1267 .sp
1268 Default value: \fB0\fR% (disabled).
1269 .RE
1270
1271 .sp
1272 .ne 2
1273 .na
1274 \fBzfs_arc_shrinker_limit\fR (int)
1275 .ad
1276 .RS 12n
1277 This is a limit on how many pages the ARC shrinker makes available for
1278 eviction in response to one page allocation attempt.  Note that in
1279 practice, the kernel's shrinker can ask us to evict up to about 4x this
1280 for one allocation attempt.
1281 .sp
1282 The default limit of 10,000 (in practice, 160MB per allocation attempt with
1283 4K pages) limits the amount of time spent attempting to reclaim ARC memory to
1284 less than 100ms per allocation attempt, even with a small average compressed
1285 block size of ~8KB.
1286 .sp
1287 The parameter can be set to 0 (zero) to disable the limit.
1288 .sp
1289 This parameter only applies on Linux.
1290 .sp
1291 Default value: \fB10,000\fR.
1292 .RE
1293
1294 .sp
1295 .ne 2
1296 .na
1297 \fBzfs_arc_sys_free\fR (ulong)
1298 .ad
1299 .RS 12n
1300 The target number of bytes the ARC should leave as free memory on the system.
1301 Defaults to the larger of 1/64 of physical memory or 512K.  Setting this
1302 option to a non-zero value will override the default.
1303 .sp
1304 Default value: \fB0\fR.
1305 .RE
1306
1307 .sp
1308 .ne 2
1309 .na
1310 \fBzfs_autoimport_disable\fR (int)
1311 .ad
1312 .RS 12n
1313 Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR).
1314 .sp
1315 Use \fB1\fR for yes (default) and \fB0\fR for no.
1316 .RE
1317
1318 .sp
1319 .ne 2
1320 .na
1321 \fBzfs_checksum_events_per_second\fR (uint)
1322 .ad
1323 .RS 12n
1324 Rate limit checksum events to this many per second.  Note that this should
1325 not be set below the zed thresholds (currently 10 checksums over 10 sec)
1326 or else zed may not trigger any action.
1327 .sp
1328 Default value: \fB20\fR.
1329 .RE
1330
1331 .sp
1332 .ne 2
1333 .na
1334 \fBzfs_commit_timeout_pct\fR (int)
1335 .ad
1336 .RS 12n
1337 This controls the amount of time that a ZIL block (lwb) will remain "open"
1338 when it isn't "full", and it has a thread waiting for it to be committed to
1339 stable storage.  The timeout is scaled based on a percentage of the last lwb
1340 latency to avoid significantly impacting the latency of each individual
1341 transaction record (itx).
1342 .sp
1343 Default value: \fB5\fR%.
1344 .RE
1345
1346 .sp
1347 .ne 2
1348 .na
1349 \fBzfs_condense_indirect_commit_entry_delay_ms\fR (int)
1350 .ad
1351 .RS 12n
1352 Vdev indirection layer (used for device removal) sleeps for this many
1353 milliseconds during mapping generation. Intended for use with the test suite
1354 to throttle vdev removal speed.
1355 .sp
1356 Default value: \fB0\fR (no throttle).
1357 .RE
1358
1359 .sp
1360 .ne 2
1361 .na
1362 \fBzfs_condense_indirect_vdevs_enable\fR (int)
1363 .ad
1364 .RS 12n
1365 Enable condensing indirect vdev mappings.  When set to a non-zero value,
1366 attempt to condense indirect vdev mappings if the mapping uses more than
1367 \fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete
1368 space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR
1369 bytes on-disk.  The condensing process is an attempt to save memory by
1370 removing obsolete mappings.
1371 .sp
1372 Default value: \fB1\fR.
1373 .RE
1374
1375 .sp
1376 .ne 2
1377 .na
1378 \fBzfs_condense_max_obsolete_bytes\fR (ulong)
1379 .ad
1380 .RS 12n
1381 Only attempt to condense indirect vdev mappings if the on-disk size
1382 of the obsolete space map object is greater than this number of bytes
1383 (see \fBzfs_condense_indirect_vdevs_enable\fR).
1384 .sp
1385 Default value: \fB1,073,741,824\fR.
1386 .RE
1387
1388 .sp
1389 .ne 2
1390 .na
1391 \fBzfs_condense_min_mapping_bytes\fR (ulong)
1392 .ad
1393 .RS 12n
1394 Minimum size vdev mapping to attempt to condense (see
1395 \fBzfs_condense_indirect_vdevs_enable\fR).
1396 .sp
1397 Default value: \fB131,072\fR.
1398 .RE
1399
1400 .sp
1401 .ne 2
1402 .na
1403 \fBzfs_dbgmsg_enable\fR (int)
1404 .ad
1405 .RS 12n
1406 Internally ZFS keeps a small log to facilitate debugging.  By default the log
1407 is disabled, to enable it set this option to 1.  The contents of the log can
1408 be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file.  Writing 0 to
1409 this proc file clears the log.
1410 .sp
1411 Default value: \fB0\fR.
1412 .RE
1413
1414 .sp
1415 .ne 2
1416 .na
1417 \fBzfs_dbgmsg_maxsize\fR (int)
1418 .ad
1419 .RS 12n
1420 The maximum size in bytes of the internal ZFS debug log.
1421 .sp
1422 Default value: \fB4M\fR.
1423 .RE
1424
1425 .sp
1426 .ne 2
1427 .na
1428 \fBzfs_dbuf_state_index\fR (int)
1429 .ad
1430 .RS 12n
1431 This feature is currently unused. It is normally used for controlling what
1432 reporting is available under /proc/spl/kstat/zfs.
1433 .sp
1434 Default value: \fB0\fR.
1435 .RE
1436
1437 .sp
1438 .ne 2
1439 .na
1440 \fBzfs_deadman_enabled\fR (int)
1441 .ad
1442 .RS 12n
1443 When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
1444 milliseconds, or when an individual I/O takes longer than
1445 \fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
1446 be "hung".  If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
1447 invoked as described by the \fBzfs_deadman_failmode\fR module option.
1448 By default the deadman is enabled and configured to \fBwait\fR which results
1449 in "hung" I/Os only being logged.  The deadman is automatically disabled
1450 when a pool gets suspended.
1451 .sp
1452 Default value: \fB1\fR.
1453 .RE
1454
1455 .sp
1456 .ne 2
1457 .na
1458 \fBzfs_deadman_failmode\fR (charp)
1459 .ad
1460 .RS 12n
1461 Controls the failure behavior when the deadman detects a "hung" I/O.  Valid
1462 values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR.
1463 .sp
1464 \fBwait\fR - Wait for a "hung" I/O to complete.  For each "hung" I/O a
1465 "deadman" event will be posted describing that I/O.
1466 .sp
1467 \fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it
1468 to the I/O pipeline if possible.
1469 .sp
1470 \fBpanic\fR - Panic the system.  This can be used to facilitate an automatic
1471 fail-over to a properly configured fail-over partner.
1472 .sp
1473 Default value: \fBwait\fR.
1474 .RE
1475
1476 .sp
1477 .ne 2
1478 .na
1479 \fBzfs_deadman_checktime_ms\fR (int)
1480 .ad
1481 .RS 12n
1482 Check time in milliseconds. This defines the frequency at which we check
1483 for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior.
1484 .sp
1485 Default value: \fB60,000\fR.
1486 .RE
1487
1488 .sp
1489 .ne 2
1490 .na
1491 \fBzfs_deadman_synctime_ms\fR (ulong)
1492 .ad
1493 .RS 12n
1494 Interval in milliseconds after which the deadman is triggered and also
1495 the interval after which a pool sync operation is considered to be "hung".
1496 Once this limit is exceeded the deadman will be invoked every
1497 \fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes.
1498 .sp
1499 Default value: \fB600,000\fR.
1500 .RE
1501
1502 .sp
1503 .ne 2
1504 .na
1505 \fBzfs_deadman_ziotime_ms\fR (ulong)
1506 .ad
1507 .RS 12n
1508 Interval in milliseconds after which the deadman is triggered and an
1509 individual I/O operation is considered to be "hung".  As long as the I/O
1510 remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR
1511 milliseconds until the I/O completes.
1512 .sp
1513 Default value: \fB300,000\fR.
1514 .RE
1515
1516 .sp
1517 .ne 2
1518 .na
1519 \fBzfs_dedup_prefetch\fR (int)
1520 .ad
1521 .RS 12n
1522 Enable prefetching dedup-ed blks
1523 .sp
1524 Use \fB1\fR for yes and \fB0\fR to disable (default).
1525 .RE
1526
1527 .sp
1528 .ne 2
1529 .na
1530 \fBzfs_delay_min_dirty_percent\fR (int)
1531 .ad
1532 .RS 12n
1533 Start to delay each transaction once there is this amount of dirty data,
1534 expressed as a percentage of \fBzfs_dirty_data_max\fR.
1535 This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
1536 See the section "ZFS TRANSACTION DELAY".
1537 .sp
1538 Default value: \fB60\fR%.
1539 .RE
1540
1541 .sp
1542 .ne 2
1543 .na
1544 \fBzfs_delay_scale\fR (int)
1545 .ad
1546 .RS 12n
1547 This controls how quickly the transaction delay approaches infinity.
1548 Larger values cause longer delays for a given amount of dirty data.
1549 .sp
1550 For the smoothest delay, this value should be about 1 billion divided
1551 by the maximum number of operations per second.  This will smoothly
1552 handle between 10x and 1/10th this number.
1553 .sp
1554 See the section "ZFS TRANSACTION DELAY".
1555 .sp
1556 Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
1557 .sp
1558 Default value: \fB500,000\fR.
1559 .RE
1560
1561 .sp
1562 .ne 2
1563 .na
1564 \fBzfs_disable_ivset_guid_check\fR (int)
1565 .ad
1566 .RS 12n
1567 Disables requirement for IVset guids to be present and match when doing a raw
1568 receive of encrypted datasets. Intended for users whose pools were created with
1569 OpenZFS pre-release versions and now have compatibility issues.
1570 .sp
1571 Default value: \fB0\fR.
1572 .RE
1573
1574 .sp
1575 .ne 2
1576 .na
1577 \fBzfs_key_max_salt_uses\fR (ulong)
1578 .ad
1579 .RS 12n
1580 Maximum number of uses of a single salt value before generating a new one for
1581 encrypted datasets. The default value is also the maximum that will be
1582 accepted.
1583 .sp
1584 Default value: \fB400,000,000\fR.
1585 .RE
1586
1587 .sp
1588 .ne 2
1589 .na
1590 \fBzfs_object_mutex_size\fR (uint)
1591 .ad
1592 .RS 12n
1593 Size of the znode hashtable used for holds.
1594
1595 Due to the need to hold locks on objects that may not exist yet, kernel mutexes
1596 are not created per-object and instead a hashtable is used where collisions
1597 will result in objects waiting when there is not actually contention on the
1598 same object.
1599 .sp
1600 Default value: \fB64\fR.
1601 .RE
1602
1603 .sp
1604 .ne 2
1605 .na
1606 \fBzfs_slow_io_events_per_second\fR (int)
1607 .ad
1608 .RS 12n
1609 Rate limit delay zevents (which report slow I/Os) to this many per second.
1610 .sp
1611 Default value: \fB20\fR.
1612 .RE
1613
1614 .sp
1615 .ne 2
1616 .na
1617 \fBzfs_unflushed_max_mem_amt\fR (ulong)
1618 .ad
1619 .RS 12n
1620 Upper-bound limit for unflushed metadata changes to be held by the
1621 log spacemap in memory (in bytes).
1622 .sp
1623 Default value: \fB1,073,741,824\fR (1GB).
1624 .RE
1625
1626 .sp
1627 .ne 2
1628 .na
1629 \fBzfs_unflushed_max_mem_ppm\fR (ulong)
1630 .ad
1631 .RS 12n
1632 Percentage of the overall system memory that ZFS allows to be used
1633 for unflushed metadata changes by the log spacemap.
1634 (value is calculated over 1000000 for finer granularity).
1635 .sp
1636 Default value: \fB1000\fR (which is divided by 1000000, resulting in
1637 the limit to be \fB0.1\fR% of memory)
1638 .RE
1639
1640 .sp
1641 .ne 2
1642 .na
1643 \fBzfs_unflushed_log_block_max\fR (ulong)
1644 .ad
1645 .RS 12n
1646 Describes the maximum number of log spacemap blocks allowed for each pool.
1647 The default value of 262144 means that the space in all the log spacemaps
1648 can add up to no more than 262144 blocks (which means 32GB of logical
1649 space before compression and ditto blocks, assuming that blocksize is
1650 128k).
1651 .sp
1652 This tunable is important because it involves a trade-off between import
1653 time after an unclean export and the frequency of flushing metaslabs.
1654 The higher this number is, the more log blocks we allow when the pool is
1655 active which means that we flush metaslabs less often and thus decrease
1656 the number of I/Os for spacemap updates per TXG.
1657 At the same time though, that means that in the event of an unclean export,
1658 there will be more log spacemap blocks for us to read, inducing overhead
1659 in the import time of the pool.
1660 The lower the number, the more frequent the flushing, which destroys log
1661 blocks quicker as they become obsolete faster and leaves fewer blocks
1662 to be read during import time after a crash.
1663 .sp
1664 Each log spacemap block existing during pool import leads to approximately
1665 one extra logical I/O issued.
1666 This is the reason why this tunable is exposed in terms of blocks rather
1667 than space used.
1668 .sp
1669 Default value: \fB262144\fR (256K).
1670 .RE
1671
1672 .sp
1673 .ne 2
1674 .na
1675 \fBzfs_unflushed_log_block_min\fR (ulong)
1676 .ad
1677 .RS 12n
1678 If the number of metaslabs is small and our incoming rate is high, we
1679 could get into a situation that we are flushing all our metaslabs every
1680 TXG.
1681 Thus we always allow at least this many log blocks.
1682 .sp
1683 Default value: \fB1000\fR.
1684 .RE
1685
1686 .sp
1687 .ne 2
1688 .na
1689 \fBzfs_unflushed_log_block_pct\fR (ulong)
1690 .ad
1691 .RS 12n
1692 Tunable used to determine the number of blocks that can be used for
1693 the spacemap log, expressed as a percentage of the total number of
1694 metaslabs in the pool.
1695 .sp
1696 Default value: \fB400\fR (read as \fB400\fR% - meaning that the number
1697 of log spacemap blocks is capped at 4 times the number of
1698 metaslabs in the pool).
1699 .RE
1700
1701 .sp
1702 .ne 2
1703 .na
1704 \fBzfs_unlink_suspend_progress\fR (uint)
1705 .ad
1706 .RS 12n
1707 When enabled, files will not be asynchronously removed from the list of pending
1708 unlinks and the space they consume will be leaked. Once this option has been
1709 disabled and the dataset is remounted, the pending unlinks will be processed
1710 and the freed space returned to the pool.
1711 This option is used by the test suite to facilitate testing.
1712 .sp
1713 Use \fB0\fR (default) to allow progress and \fB1\fR to pause progress.
1714 .RE
1715
1716 .sp
1717 .ne 2
1718 .na
1719 \fBzfs_delete_blocks\fR (ulong)
1720 .ad
1721 .RS 12n
1722 This is used to define a large file for the purposes of delete.  Files
1723 containing more than \fBzfs_delete_blocks\fR will be deleted asynchronously
1724 while smaller files are deleted synchronously.  Decreasing this value will
1725 reduce the time spent in an unlink(2) system call at the expense of a longer
1726 delay before the freed space is available.
1727 .sp
1728 Default value: \fB20,480\fR.
1729 .RE
1730
1731 .sp
1732 .ne 2
1733 .na
1734 \fBzfs_dirty_data_max\fR (int)
1735 .ad
1736 .RS 12n
1737 Determines the dirty space limit in bytes.  Once this limit is exceeded, new
1738 writes are halted until space frees up. This parameter takes precedence
1739 over \fBzfs_dirty_data_max_percent\fR.
1740 See the section "ZFS TRANSACTION DELAY".
1741 .sp
1742 Default value: \fB10\fR% of physical RAM, capped at \fBzfs_dirty_data_max_max\fR.
1743 .RE
1744
1745 .sp
1746 .ne 2
1747 .na
1748 \fBzfs_dirty_data_max_max\fR (int)
1749 .ad
1750 .RS 12n
1751 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
1752 This limit is only enforced at module load time, and will be ignored if
1753 \fBzfs_dirty_data_max\fR is later changed.  This parameter takes
1754 precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
1755 "ZFS TRANSACTION DELAY".
1756 .sp
1757 Default value: \fB25\fR% of physical RAM.
1758 .RE
1759
1760 .sp
1761 .ne 2
1762 .na
1763 \fBzfs_dirty_data_max_max_percent\fR (int)
1764 .ad
1765 .RS 12n
1766 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
1767 percentage of physical RAM.  This limit is only enforced at module load
1768 time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
1769 The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
1770 one. See the section "ZFS TRANSACTION DELAY".
1771 .sp
1772 Default value: \fB25\fR%.
1773 .RE
1774
1775 .sp
1776 .ne 2
1777 .na
1778 \fBzfs_dirty_data_max_percent\fR (int)
1779 .ad
1780 .RS 12n
1781 Determines the dirty space limit, expressed as a percentage of all
1782 memory.  Once this limit is exceeded, new writes are halted until space frees
1783 up.  The parameter \fBzfs_dirty_data_max\fR takes precedence over this
1784 one.  See the section "ZFS TRANSACTION DELAY".
1785 .sp
1786 Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR.
1787 .RE
1788
1789 .sp
1790 .ne 2
1791 .na
1792 \fBzfs_dirty_data_sync_percent\fR (int)
1793 .ad
1794 .RS 12n
1795 Start syncing out a transaction group if there's at least this much dirty data
1796 as a percentage of \fBzfs_dirty_data_max\fR.  This should be less than
1797 \fBzfs_vdev_async_write_active_min_dirty_percent\fR.
1798 .sp
1799 Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
1800 .RE
1801
1802 .sp
1803 .ne 2
1804 .na
1805 \fBzfs_fallocate_reserve_percent\fR (uint)
1806 .ad
1807 .RS 12n
1808 Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
1809 preallocated for a file in order to guarantee that later writes will not
1810 run out of space.  Instead, fallocate() space preallocation only checks
1811 that sufficient space is currently available in the pool or the user's
1812 project quota allocation, and then creates a sparse file of the requested
1813 size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR
1814 to allow additional space for indirect blocks and other internal metadata.
1815 Setting this value to 0 disables support for fallocate(2) and returns
1816 EOPNOTSUPP for fallocate() space preallocation again.
1817 .sp
1818 Default value: \fB110\fR%
1819 .RE
1820
1821 .sp
1822 .ne 2
1823 .na
1824 \fBzfs_fletcher_4_impl\fR (string)
1825 .ad
1826 .RS 12n
1827 Select a fletcher 4 implementation.
1828 .sp
1829 Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
1830 \fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
1831 All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
1832 set extensions to be available and will only appear if ZFS detects that they are
1833 present at runtime. If multiple implementations of fletcher 4 are available,
1834 the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR
1835 results in the original, CPU based calculation, being used. Selecting any option
1836 other than \fBfastest\fR and \fBscalar\fR results in vector instructions from
1837 the respective CPU instruction set being used.
1838 .sp
1839 Default value: \fBfastest\fR.
1840 .RE
1841
1842 .sp
1843 .ne 2
1844 .na
1845 \fBzfs_free_bpobj_enabled\fR (int)
1846 .ad
1847 .RS 12n
1848 Enable/disable the processing of the free_bpobj object.
1849 .sp
1850 Default value: \fB1\fR.
1851 .RE
1852
1853 .sp
1854 .ne 2
1855 .na
1856 \fBzfs_async_block_max_blocks\fR (ulong)
1857 .ad
1858 .RS 12n
1859 Maximum number of blocks freed in a single txg.
1860 .sp
1861 Default value: \fBULONG_MAX\fR (unlimited).
1862 .RE
1863
1864 .sp
1865 .ne 2
1866 .na
1867 \fBzfs_max_async_dedup_frees\fR (ulong)
1868 .ad
1869 .RS 12n
1870 Maximum number of dedup blocks freed in a single txg.
1871 .sp
1872 Default value: \fB100,000\fR.
1873 .RE
1874
1875 .sp
1876 .ne 2
1877 .na
1878 \fBzfs_override_estimate_recordsize\fR (ulong)
1879 .ad
1880 .RS 12n
1881 Record size calculation override for zfs send estimates.
1882 .sp
1883 Default value: \fB0\fR.
1884 .RE
1885
1886 .sp
1887 .ne 2
1888 .na
1889 \fBzfs_vdev_async_read_max_active\fR (int)
1890 .ad
1891 .RS 12n
1892 Maximum asynchronous read I/Os active to each device.
1893 See the section "ZFS I/O SCHEDULER".
1894 .sp
1895 Default value: \fB3\fR.
1896 .RE
1897
1898 .sp
1899 .ne 2
1900 .na
1901 \fBzfs_vdev_async_read_min_active\fR (int)
1902 .ad
1903 .RS 12n
1904 Minimum asynchronous read I/Os active to each device.
1905 See the section "ZFS I/O SCHEDULER".
1906 .sp
1907 Default value: \fB1\fR.
1908 .RE
1909
1910 .sp
1911 .ne 2
1912 .na
1913 \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int)
1914 .ad
1915 .RS 12n
1916 When the pool has more than
1917 \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use
1918 \fBzfs_vdev_async_write_max_active\fR to limit active async writes.  If
1919 the dirty data is between min and max, the active I/O limit is linearly
1920 interpolated. See the section "ZFS I/O SCHEDULER".
1921 .sp
1922 Default value: \fB60\fR%.
1923 .RE
1924
1925 .sp
1926 .ne 2
1927 .na
1928 \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int)
1929 .ad
1930 .RS 12n
1931 When the pool has less than
1932 \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use
1933 \fBzfs_vdev_async_write_min_active\fR to limit active async writes.  If
1934 the dirty data is between min and max, the active I/O limit is linearly
1935 interpolated. See the section "ZFS I/O SCHEDULER".
1936 .sp
1937 Default value: \fB30\fR%.
1938 .RE
1939
1940 .sp
1941 .ne 2
1942 .na
1943 \fBzfs_vdev_async_write_max_active\fR (int)
1944 .ad
1945 .RS 12n
1946 Maximum asynchronous write I/Os active to each device.
1947 See the section "ZFS I/O SCHEDULER".
1948 .sp
1949 Default value: \fB10\fR.
1950 .RE
1951
1952 .sp
1953 .ne 2
1954 .na
1955 \fBzfs_vdev_async_write_min_active\fR (int)
1956 .ad
1957 .RS 12n
1958 Minimum asynchronous write I/Os active to each device.
1959 See the section "ZFS I/O SCHEDULER".
1960 .sp
1961 Lower values are associated with better latency on rotational media but poorer
1962 resilver performance. The default value of 2 was chosen as a compromise. A
1963 value of 3 has been shown to improve resilver performance further at a cost of
1964 further increasing latency.
1965 .sp
1966 Default value: \fB2\fR.
1967 .RE
1968
1969 .sp
1970 .ne 2
1971 .na
1972 \fBzfs_vdev_initializing_max_active\fR (int)
1973 .ad
1974 .RS 12n
1975 Maximum initializing I/Os active to each device.
1976 See the section "ZFS I/O SCHEDULER".
1977 .sp
1978 Default value: \fB1\fR.
1979 .RE
1980
1981 .sp
1982 .ne 2
1983 .na
1984 \fBzfs_vdev_initializing_min_active\fR (int)
1985 .ad
1986 .RS 12n
1987 Minimum initializing I/Os active to each device.
1988 See the section "ZFS I/O SCHEDULER".
1989 .sp
1990 Default value: \fB1\fR.
1991 .RE
1992
1993 .sp
1994 .ne 2
1995 .na
1996 \fBzfs_vdev_max_active\fR (int)
1997 .ad
1998 .RS 12n
1999 The maximum number of I/Os active to each device.  Ideally, this will be >=
2000 the sum of each queue's max_active.  It must be at least the sum of each
2001 queue's min_active.  See the section "ZFS I/O SCHEDULER".
2002 .sp
2003 Default value: \fB1,000\fR.
2004 .RE
2005
2006 .sp
2007 .ne 2
2008 .na
2009 \fBzfs_vdev_rebuild_max_active\fR (int)
2010 .ad
2011 .RS 12n
2012 Maximum sequential resilver I/Os active to each device.
2013 See the section "ZFS I/O SCHEDULER".
2014 .sp
2015 Default value: \fB3\fR.
2016 .RE
2017
2018 .sp
2019 .ne 2
2020 .na
2021 \fBzfs_vdev_rebuild_min_active\fR (int)
2022 .ad
2023 .RS 12n
2024 Minimum sequential resilver I/Os active to each device.
2025 See the section "ZFS I/O SCHEDULER".
2026 .sp
2027 Default value: \fB1\fR.
2028 .RE
2029
2030 .sp
2031 .ne 2
2032 .na
2033 \fBzfs_vdev_removal_max_active\fR (int)
2034 .ad
2035 .RS 12n
2036 Maximum removal I/Os active to each device.
2037 See the section "ZFS I/O SCHEDULER".
2038 .sp
2039 Default value: \fB2\fR.
2040 .RE
2041
2042 .sp
2043 .ne 2
2044 .na
2045 \fBzfs_vdev_removal_min_active\fR (int)
2046 .ad
2047 .RS 12n
2048 Minimum removal I/Os active to each device.
2049 See the section "ZFS I/O SCHEDULER".
2050 .sp
2051 Default value: \fB1\fR.
2052 .RE
2053
2054 .sp
2055 .ne 2
2056 .na
2057 \fBzfs_vdev_scrub_max_active\fR (int)
2058 .ad
2059 .RS 12n
2060 Maximum scrub I/Os active to each device.
2061 See the section "ZFS I/O SCHEDULER".
2062 .sp
2063 Default value: \fB2\fR.
2064 .RE
2065
2066 .sp
2067 .ne 2
2068 .na
2069 \fBzfs_vdev_scrub_min_active\fR (int)
2070 .ad
2071 .RS 12n
2072 Minimum scrub I/Os active to each device.
2073 See the section "ZFS I/O SCHEDULER".
2074 .sp
2075 Default value: \fB1\fR.
2076 .RE
2077
2078 .sp
2079 .ne 2
2080 .na
2081 \fBzfs_vdev_sync_read_max_active\fR (int)
2082 .ad
2083 .RS 12n
2084 Maximum synchronous read I/Os active to each device.
2085 See the section "ZFS I/O SCHEDULER".
2086 .sp
2087 Default value: \fB10\fR.
2088 .RE
2089
2090 .sp
2091 .ne 2
2092 .na
2093 \fBzfs_vdev_sync_read_min_active\fR (int)
2094 .ad
2095 .RS 12n
2096 Minimum synchronous read I/Os active to each device.
2097 See the section "ZFS I/O SCHEDULER".
2098 .sp
2099 Default value: \fB10\fR.
2100 .RE
2101
2102 .sp
2103 .ne 2
2104 .na
2105 \fBzfs_vdev_sync_write_max_active\fR (int)
2106 .ad
2107 .RS 12n
2108 Maximum synchronous write I/Os active to each device.
2109 See the section "ZFS I/O SCHEDULER".
2110 .sp
2111 Default value: \fB10\fR.
2112 .RE
2113
2114 .sp
2115 .ne 2
2116 .na
2117 \fBzfs_vdev_sync_write_min_active\fR (int)
2118 .ad
2119 .RS 12n
2120 Minimum synchronous write I/Os active to each device.
2121 See the section "ZFS I/O SCHEDULER".
2122 .sp
2123 Default value: \fB10\fR.
2124 .RE
2125
2126 .sp
2127 .ne 2
2128 .na
2129 \fBzfs_vdev_trim_max_active\fR (int)
2130 .ad
2131 .RS 12n
2132 Maximum trim/discard I/Os active to each device.
2133 See the section "ZFS I/O SCHEDULER".
2134 .sp
2135 Default value: \fB2\fR.
2136 .RE
2137
2138 .sp
2139 .ne 2
2140 .na
2141 \fBzfs_vdev_trim_min_active\fR (int)
2142 .ad
2143 .RS 12n
2144 Minimum trim/discard I/Os active to each device.
2145 See the section "ZFS I/O SCHEDULER".
2146 .sp
2147 Default value: \fB1\fR.
2148 .RE
2149
2150 .sp
2151 .ne 2
2152 .na
2153 \fBzfs_vdev_queue_depth_pct\fR (int)
2154 .ad
2155 .RS 12n
2156 Maximum number of queued allocations per top-level vdev expressed as
2157 a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the
2158 system to detect devices that are more capable of handling allocations
2159 and to allocate more blocks to those devices.  It allows for dynamic
2160 allocation distribution when devices are imbalanced as fuller devices
2161 will tend to be slower than empty devices.
2162
2163 See also \fBzio_dva_throttle_enabled\fR.
2164 .sp
2165 Default value: \fB1000\fR%.
2166 .RE
2167
2168 .sp
2169 .ne 2
2170 .na
2171 \fBzfs_expire_snapshot\fR (int)
2172 .ad
2173 .RS 12n
2174 Seconds to expire .zfs/snapshot
2175 .sp
2176 Default value: \fB300\fR.
2177 .RE
2178
2179 .sp
2180 .ne 2
2181 .na
2182 \fBzfs_admin_snapshot\fR (int)
2183 .ad
2184 .RS 12n
2185 Allow the creation, removal, or renaming of entries in the .zfs/snapshot
2186 directory to cause the creation, destruction, or renaming of snapshots.
2187 When enabled this functionality works both locally and over NFS exports
2188 which have the 'no_root_squash' option set. This functionality is disabled
2189 by default.
2190 .sp
2191 Use \fB1\fR for yes and \fB0\fR for no (default).
2192 .RE
2193
2194 .sp
2195 .ne 2
2196 .na
2197 \fBzfs_flags\fR (int)
2198 .ad
2199 .RS 12n
2200 Set additional debugging flags. The following flags may be bitwise-or'd
2201 together.
2202 .sp
2203 .TS
2204 box;
2205 rB lB
2206 lB lB
2207 r l.
2208 Value   Symbolic Name
2209         Description
2210 _
2211 1       ZFS_DEBUG_DPRINTF
2212         Enable dprintf entries in the debug log.
2213 _
2214 2       ZFS_DEBUG_DBUF_VERIFY *
2215         Enable extra dbuf verifications.
2216 _
2217 4       ZFS_DEBUG_DNODE_VERIFY *
2218         Enable extra dnode verifications.
2219 _
2220 8       ZFS_DEBUG_SNAPNAMES
2221         Enable snapshot name verification.
2222 _
2223 16      ZFS_DEBUG_MODIFY
2224         Check for illegally modified ARC buffers.
2225 _
2226 64      ZFS_DEBUG_ZIO_FREE
2227         Enable verification of block frees.
2228 _
2229 128     ZFS_DEBUG_HISTOGRAM_VERIFY
2230         Enable extra spacemap histogram verifications.
2231 _
2232 256     ZFS_DEBUG_METASLAB_VERIFY
2233         Verify space accounting on disk matches in-core range_trees.
2234 _
2235 512     ZFS_DEBUG_SET_ERROR
2236         Enable SET_ERROR and dprintf entries in the debug log.
2237 _
2238 1024    ZFS_DEBUG_INDIRECT_REMAP
2239         Verify split blocks created by device removal.
2240 _
2241 2048    ZFS_DEBUG_TRIM
2242         Verify TRIM ranges are always within the allocatable range tree.
2243 _
2244 4096    ZFS_DEBUG_LOG_SPACEMAP
2245         Verify that the log summary is consistent with the spacemap log
2246         and enable zfs_dbgmsgs for metaslab loading and flushing.
2247 .TE
2248 .sp
2249 * Requires debug build.
2250 .sp
2251 Default value: \fB0\fR.
2252 .RE
2253
2254 .sp
2255 .ne 2
2256 .na
2257 \fBzfs_free_leak_on_eio\fR (int)
2258 .ad
2259 .RS 12n
2260 If destroy encounters an EIO while reading metadata (e.g. indirect
2261 blocks), space referenced by the missing metadata can not be freed.
2262 Normally this causes the background destroy to become "stalled", as
2263 it is unable to make forward progress.  While in this stalled state,
2264 all remaining space to free from the error-encountering filesystem is
2265 "temporarily leaked".  Set this flag to cause it to ignore the EIO,
2266 permanently leak the space from indirect blocks that can not be read,
2267 and continue to free everything else that it can.
2268
2269 The default, "stalling" behavior is useful if the storage partially
2270 fails (i.e. some but not all i/os fail), and then later recovers.  In
2271 this case, we will be able to continue pool operations while it is
2272 partially failed, and when it recovers, we can continue to free the
2273 space, with no leaks.  However, note that this case is actually
2274 fairly rare.
2275
2276 Typically pools either (a) fail completely (but perhaps temporarily,
2277 e.g. a top-level vdev going offline), or (b) have localized,
2278 permanent errors (e.g. disk returns the wrong data due to bit flip or
2279 firmware bug).  In case (a), this setting does not matter because the
2280 pool will be suspended and the sync thread will not be able to make
2281 forward progress regardless.  In case (b), because the error is
2282 permanent, the best we can do is leak the minimum amount of space,
2283 which is what setting this flag will do.  Therefore, it is reasonable
2284 for this flag to normally be set, but we chose the more conservative
2285 approach of not setting it, so that there is no possibility of
2286 leaking space in the "partial temporary" failure case.
2287 .sp
2288 Default value: \fB0\fR.
2289 .RE
2290
2291 .sp
2292 .ne 2
2293 .na
2294 \fBzfs_free_min_time_ms\fR (int)
2295 .ad
2296 .RS 12n
2297 During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a minimum
2298 of this much time will be spent working on freeing blocks per txg.
2299 .sp
2300 Default value: \fB1,000\fR.
2301 .RE
2302
2303 .sp
2304 .ne 2
2305 .na
2306 \fBzfs_obsolete_min_time_ms\fR (int)
2307 .ad
2308 .RS 12n
2309 Similar to \fBzfs_free_min_time_ms\fR but for cleanup of old indirection records
2310 for removed vdevs.
2311 .sp
2312 Default value: \fB500\fR.
2313 .RE
2314
2315 .sp
2316 .ne 2
2317 .na
2318 \fBzfs_immediate_write_sz\fR (long)
2319 .ad
2320 .RS 12n
2321 Largest data block to write to the ZIL. Larger blocks will be treated as if the
2322 dataset being written to had the property setting \fBlogbias=throughput\fR.
2323 .sp
2324 Default value: \fB32,768\fR.
2325 .RE
2326
2327 .sp
2328 .ne 2
2329 .na
2330 \fBzfs_initialize_value\fR (ulong)
2331 .ad
2332 .RS 12n
2333 Pattern written to vdev free space by \fBzpool initialize\fR.
2334 .sp
2335 Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
2336 .RE
2337
2338 .sp
2339 .ne 2
2340 .na
2341 \fBzfs_initialize_chunk_size\fR (ulong)
2342 .ad
2343 .RS 12n
2344 Size of writes used by \fBzpool initialize\fR.
2345 This option is used by the test suite to facilitate testing.
2346 .sp
2347 Default value: \fB1,048,576\fR.
2348 .RE
2349
2350 .sp
2351 .ne 2
2352 .na
2353 \fBzfs_livelist_max_entries\fR (ulong)
2354 .ad
2355 .RS 12n
2356 The threshold size (in block pointers) at which we create a new sub-livelist.
2357 Larger sublists are more costly from a memory perspective but the fewer
2358 sublists there are, the lower the cost of insertion.
2359 .sp
2360 Default value: \fB500,000\fR.
2361 .RE
2362
2363 .sp
2364 .ne 2
2365 .na
2366 \fBzfs_livelist_min_percent_shared\fR (int)
2367 .ad
2368 .RS 12n
2369 If the amount of shared space between a snapshot and its clone drops below
2370 this threshold, the clone turns off the livelist and reverts to the old deletion
2371 method. This is in place because once a clone has been overwritten enough
2372 livelists no longer give us a benefit.
2373 .sp
2374 Default value: \fB75\fR.
2375 .RE
2376
2377 .sp
2378 .ne 2
2379 .na
2380 \fBzfs_livelist_condense_new_alloc\fR (int)
2381 .ad
2382 .RS 12n
2383 Incremented each time an extra ALLOC blkptr is added to a livelist entry while
2384 it is being condensed.
2385 This option is used by the test suite to track race conditions.
2386 .sp
2387 Default value: \fB0\fR.
2388 .RE
2389
2390 .sp
2391 .ne 2
2392 .na
2393 \fBzfs_livelist_condense_sync_cancel\fR (int)
2394 .ad
2395 .RS 12n
2396 Incremented each time livelist condensing is canceled while in
2397 spa_livelist_condense_sync.
2398 This option is used by the test suite to track race conditions.
2399 .sp
2400 Default value: \fB0\fR.
2401 .RE
2402
2403 .sp
2404 .ne 2
2405 .na
2406 \fBzfs_livelist_condense_sync_pause\fR (int)
2407 .ad
2408 .RS 12n
2409 When set, the livelist condense process pauses indefinitely before
2410 executing the synctask - spa_livelist_condense_sync.
2411 This option is used by the test suite to trigger race conditions.
2412 .sp
2413 Default value: \fB0\fR.
2414 .RE
2415
2416 .sp
2417 .ne 2
2418 .na
2419 \fBzfs_livelist_condense_zthr_cancel\fR (int)
2420 .ad
2421 .RS 12n
2422 Incremented each time livelist condensing is canceled while in
2423 spa_livelist_condense_cb.
2424 This option is used by the test suite to track race conditions.
2425 .sp
2426 Default value: \fB0\fR.
2427 .RE
2428
2429 .sp
2430 .ne 2
2431 .na
2432 \fBzfs_livelist_condense_zthr_pause\fR (int)
2433 .ad
2434 .RS 12n
2435 When set, the livelist condense process pauses indefinitely before
2436 executing the open context condensing work in spa_livelist_condense_cb.
2437 This option is used by the test suite to trigger race conditions.
2438 .sp
2439 Default value: \fB0\fR.
2440 .RE
2441
2442 .sp
2443 .ne 2
2444 .na
2445 \fBzfs_lua_max_instrlimit\fR (ulong)
2446 .ad
2447 .RS 12n
2448 The maximum execution time limit that can be set for a ZFS channel program,
2449 specified as a number of Lua instructions.
2450 .sp
2451 Default value: \fB100,000,000\fR.
2452 .RE
2453
2454 .sp
2455 .ne 2
2456 .na
2457 \fBzfs_lua_max_memlimit\fR (ulong)
2458 .ad
2459 .RS 12n
2460 The maximum memory limit that can be set for a ZFS channel program, specified
2461 in bytes.
2462 .sp
2463 Default value: \fB104,857,600\fR.
2464 .RE
2465
2466 .sp
2467 .ne 2
2468 .na
2469 \fBzfs_max_dataset_nesting\fR (int)
2470 .ad
2471 .RS 12n
2472 The maximum depth of nested datasets.  This value can be tuned temporarily to
2473 fix existing datasets that exceed the predefined limit.
2474 .sp
2475 Default value: \fB50\fR.
2476 .RE
2477
2478 .sp
2479 .ne 2
2480 .na
2481 \fBzfs_max_log_walking\fR (ulong)
2482 .ad
2483 .RS 12n
2484 The number of past TXGs that the flushing algorithm of the log spacemap
2485 feature uses to estimate incoming log blocks.
2486 .sp
2487 Default value: \fB5\fR.
2488 .RE
2489
2490 .sp
2491 .ne 2
2492 .na
2493 \fBzfs_max_logsm_summary_length\fR (ulong)
2494 .ad
2495 .RS 12n
2496 Maximum number of rows allowed in the summary of the spacemap log.
2497 .sp
2498 Default value: \fB10\fR.
2499 .RE
2500
2501 .sp
2502 .ne 2
2503 .na
2504 \fBzfs_max_recordsize\fR (int)
2505 .ad
2506 .RS 12n
2507 We currently support block sizes from 512 bytes to 16MB.  The benefits of
2508 larger blocks, and thus larger I/O, need to be weighed against the cost of
2509 COWing a giant block to modify one byte.  Additionally, very large blocks
2510 can have an impact on i/o latency, and also potentially on the memory
2511 allocator.  Therefore, we do not allow the recordsize to be set larger than
2512 zfs_max_recordsize (default 1MB).  Larger blocks can be created by changing
2513 this tunable, and pools with larger blocks can always be imported and used,
2514 regardless of this setting.
2515 .sp
2516 Default value: \fB1,048,576\fR.
2517 .RE
2518
2519 .sp
2520 .ne 2
2521 .na
2522 \fBzfs_allow_redacted_dataset_mount\fR (int)
2523 .ad
2524 .RS 12n
2525 Allow datasets received with redacted send/receive to be mounted. Normally
2526 disabled because these datasets may be missing key data.
2527 .sp
2528 Default value: \fB0\fR.
2529 .RE
2530
2531 .sp
2532 .ne 2
2533 .na
2534 \fBzfs_min_metaslabs_to_flush\fR (ulong)
2535 .ad
2536 .RS 12n
2537 Minimum number of metaslabs to flush per dirty TXG.
2538 .sp
2539 Default value: \fB1\fR.
2540 .RE
2541
2542 .sp
2543 .ne 2
2544 .na
2545 \fBzfs_metaslab_fragmentation_threshold\fR (int)
2546 .ad
2547 .RS 12n
2548 Allow metaslabs to keep their active state as long as their fragmentation
2549 percentage is less than or equal to this value. An active metaslab that
2550 exceeds this threshold will no longer keep its active status allowing
2551 better metaslabs to be selected.
2552 .sp
2553 Default value: \fB70\fR.
2554 .RE
2555
2556 .sp
2557 .ne 2
2558 .na
2559 \fBzfs_mg_fragmentation_threshold\fR (int)
2560 .ad
2561 .RS 12n
2562 Metaslab groups are considered eligible for allocations if their
2563 fragmentation metric (measured as a percentage) is less than or equal to
2564 this value. If a metaslab group exceeds this threshold then it will be
2565 skipped unless all metaslab groups within the metaslab class have also
2566 crossed this threshold.
2567 .sp
2568 Default value: \fB95\fR.
2569 .RE
2570
2571 .sp
2572 .ne 2
2573 .na
2574 \fBzfs_mg_noalloc_threshold\fR (int)
2575 .ad
2576 .RS 12n
2577 Defines a threshold at which metaslab groups should be eligible for
2578 allocations.  The value is expressed as a percentage of free space
2579 beyond which a metaslab group is always eligible for allocations.
2580 If a metaslab group's free space is less than or equal to the
2581 threshold, the allocator will avoid allocating to that group
2582 unless all groups in the pool have reached the threshold.  Once all
2583 groups have reached the threshold, all groups are allowed to accept
2584 allocations.  The default value of 0 disables the feature and causes
2585 all metaslab groups to be eligible for allocations.
2586
2587 This parameter allows one to deal with pools having heavily imbalanced
2588 vdevs such as would be the case when a new vdev has been added.
2589 Setting the threshold to a non-zero percentage will stop allocations
2590 from being made to vdevs whose free space is at or below the specified
2591 percentage and allow less-filled vdevs to acquire more allocations than they
2592 otherwise would under the old \fBzfs_mg_alloc_failures\fR facility.
2593 .sp
2594 Default value: \fB0\fR.
2595 .RE
2596
2597 .sp
2598 .ne 2
2599 .na
2600 \fBzfs_ddt_data_is_special\fR (int)
2601 .ad
2602 .RS 12n
2603 If enabled, ZFS will place DDT data into the special allocation class.
2604 .sp
2605 Default value: \fB1\fR.
2606 .RE
2607
2608 .sp
2609 .ne 2
2610 .na
2611 \fBzfs_user_indirect_is_special\fR (int)
2612 .ad
2613 .RS 12n
2614 If enabled, ZFS will place user data (both file and zvol) indirect blocks
2615 into the special allocation class.
2616 .sp
2617 Default value: \fB1\fR.
2618 .RE
2619
2620 .sp
2621 .ne 2
2622 .na
2623 \fBzfs_multihost_history\fR (int)
2624 .ad
2625 .RS 12n
2626 Historical statistics for the last N multihost updates will be available in
2627 \fB/proc/spl/kstat/zfs/<pool>/multihost\fR.
2628 .sp
2629 Default value: \fB0\fR.
2630 .RE
2631
2632 .sp
2633 .ne 2
2634 .na
2635 \fBzfs_multihost_interval\fR (ulong)
2636 .ad
2637 .RS 12n
2638 Used to control the frequency of multihost writes which are performed when the
2639 \fBmultihost\fR pool property is on.  This is one factor used to determine the
2640 length of the activity check during import.
2641 .sp
2642 The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR
2643 milliseconds.  On average a multihost write will be issued for each leaf vdev
2644 every \fBzfs_multihost_interval\fR milliseconds.  In practice, the observed
2645 period can vary with the I/O load and this observed value is the delay which is
2646 stored in the uberblock.
2647 .sp
2648 Default value: \fB1000\fR.
2649 .RE
2650
2651 .sp
2652 .ne 2
2653 .na
2654 \fBzfs_multihost_import_intervals\fR (uint)
2655 .ad
2656 .RS 12n
2657 Used to control the duration of the activity test on import.  Smaller values of
2658 \fBzfs_multihost_import_intervals\fR will reduce the import time but increase
2659 the risk of failing to detect an active pool.  The total activity check time is
2660 never allowed to drop below one second.
2661 .sp
2662 On import the activity check waits a minimum amount of time determined by
2663 \fBzfs_multihost_interval * zfs_multihost_import_intervals\fR, or the same
2664 product computed on the host which last had the pool imported (whichever is
2665 greater).  The activity check time may be further extended if the value of mmp
2666 delay found in the best uberblock indicates actual multihost updates happened
2667 at longer intervals than \fBzfs_multihost_interval\fR.  A minimum value of
2668 \fB100ms\fR is enforced.
2669 .sp
2670 A value of 0 is ignored and treated as if it was set to 1.
2671 .sp
2672 Default value: \fB20\fR.
2673 .RE
2674
2675 .sp
2676 .ne 2
2677 .na
2678 \fBzfs_multihost_fail_intervals\fR (uint)
2679 .ad
2680 .RS 12n
2681 Controls the behavior of the pool when multihost write failures or delays are
2682 detected.
2683 .sp
2684 When \fBzfs_multihost_fail_intervals = 0\fR, multihost write failures or delays
2685 are ignored.  The failures will still be reported to the ZED which depending on
2686 its configuration may take action such as suspending the pool or offlining a
2687 device.
2688
2689 .sp
2690 When \fBzfs_multihost_fail_intervals > 0\fR, the pool will be suspended if
2691 \fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds pass
2692 without a successful mmp write.  This guarantees the activity test will see
2693 mmp writes if the pool is imported.  A value of 1 is ignored and treated as
2694 if it was set to 2.  This is necessary to prevent the pool from being suspended
2695 due to normal, small I/O latency variations.
2696
2697 .sp
2698 Default value: \fB10\fR.
2699 .RE
2700
2701 .sp
2702 .ne 2
2703 .na
2704 \fBzfs_no_scrub_io\fR (int)
2705 .ad
2706 .RS 12n
2707 Set for no scrub I/O. This results in scrubs not actually scrubbing data and
2708 simply doing a metadata crawl of the pool instead.
2709 .sp
2710 Use \fB1\fR for yes and \fB0\fR for no (default).
2711 .RE
2712
2713 .sp
2714 .ne 2
2715 .na
2716 \fBzfs_no_scrub_prefetch\fR (int)
2717 .ad
2718 .RS 12n
2719 Set to disable block prefetching for scrubs.
2720 .sp
2721 Use \fB1\fR for yes and \fB0\fR for no (default).
2722 .RE
2723
2724 .sp
2725 .ne 2
2726 .na
2727 \fBzfs_nocacheflush\fR (int)
2728 .ad
2729 .RS 12n
2730 Disable cache flush operations on disks when writing.  Setting this will
2731 cause pool corruption on power loss if a volatile out-of-order write cache
2732 is enabled.
2733 .sp
2734 Use \fB1\fR for yes and \fB0\fR for no (default).
2735 .RE
2736
2737 .sp
2738 .ne 2
2739 .na
2740 \fBzfs_nopwrite_enabled\fR (int)
2741 .ad
2742 .RS 12n
2743 Enable NOP writes.
2744 .sp
2745 Use \fB1\fR for yes (default) and \fB0\fR to disable.
2746 .RE
2747
2748 .sp
2749 .ne 2
2750 .na
2751 \fBzfs_dmu_offset_next_sync\fR (int)
2752 .ad
2753 .RS 12n
2754 Enable forcing txg sync to find holes.  When enabled, ZFS acts like
2755 prior versions when the SEEK_HOLE or SEEK_DATA flags are used: if the
2756 dnode is dirty, a txg sync is forced so that this data can be
2757 found.
2758 .sp
2759 Use \fB1\fR for yes and \fB0\fR to disable (default).
2760 .RE
2761
2762 .sp
2763 .ne 2
2764 .na
2765 \fBzfs_pd_bytes_max\fR (int)
2766 .ad
2767 .RS 12n
2768 The number of bytes which should be prefetched during a pool traversal
2769 (e.g. \fBzfs send\fR or other data crawling operations).
2770 .sp
2771 Default value: \fB52,428,800\fR.
2772 .RE
2773
2774 .sp
2775 .ne 2
2776 .na
2777 \fBzfs_per_txg_dirty_frees_percent\fR (ulong)
2778 .ad
2779 .RS 12n
2780 Tunable to control percentage of dirtied indirect blocks from frees allowed
2781 into one TXG. After this threshold is crossed, additional frees will wait until
2782 the next TXG.
2783 A value of zero will disable this throttle.
2784 .sp
2785 Default value: \fB5\fR, set to \fB0\fR to disable.
2786 .RE
2787
2788 .sp
2789 .ne 2
2790 .na
2791 \fBzfs_prefetch_disable\fR (int)
2792 .ad
2793 .RS 12n
2794 This tunable disables predictive prefetch.  Note that it leaves "prescient"
2795 prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
2796 prescient prefetch never issues i/os that end up not being needed, so it
2797 can't hurt performance.
2798 .sp
2799 Use \fB1\fR for yes and \fB0\fR for no (default).
2800 .RE
2801
2802 .sp
2803 .ne 2
2804 .na
2805 \fBzfs_qat_checksum_disable\fR (int)
2806 .ad
2807 .RS 12n
2808 This tunable disables qat hardware acceleration for sha256 checksums. It
2809 may be set after the zfs modules have been loaded to initialize the qat
2810 hardware as long as support is compiled in and the qat driver is present.
2811 .sp
2812 Use \fB1\fR for yes and \fB0\fR for no (default).
2813 .RE
2814
2815 .sp
2816 .ne 2
2817 .na
2818 \fBzfs_qat_compress_disable\fR (int)
2819 .ad
2820 .RS 12n
2821 This tunable disables qat hardware acceleration for gzip compression. It
2822 may be set after the zfs modules have been loaded to initialize the qat
2823 hardware as long as support is compiled in and the qat driver is present.
2824 .sp
2825 Use \fB1\fR for yes and \fB0\fR for no (default).
2826 .RE
2827
2828 .sp
2829 .ne 2
2830 .na
2831 \fBzfs_qat_encrypt_disable\fR (int)
2832 .ad
2833 .RS 12n
2834 This tunable disables qat hardware acceleration for AES-GCM encryption. It
2835 may be set after the zfs modules have been loaded to initialize the qat
2836 hardware as long as support is compiled in and the qat driver is present.
2837 .sp
2838 Use \fB1\fR for yes and \fB0\fR for no (default).
2839 .RE
2840
2841 .sp
2842 .ne 2
2843 .na
2844 \fBzfs_read_chunk_size\fR (long)
2845 .ad
2846 .RS 12n
2847 Bytes to read per chunk
2848 .sp
2849 Default value: \fB1,048,576\fR.
2850 .RE
2851
2852 .sp
2853 .ne 2
2854 .na
2855 \fBzfs_read_history\fR (int)
2856 .ad
2857 .RS 12n
2858 Historical statistics for the last N reads will be available in
2859 \fB/proc/spl/kstat/zfs/<pool>/reads\fR
2860 .sp
2861 Default value: \fB0\fR (no data is kept).
2862 .RE
2863
2864 .sp
2865 .ne 2
2866 .na
2867 \fBzfs_read_history_hits\fR (int)
2868 .ad
2869 .RS 12n
2870 Include cache hits in read history
2871 .sp
2872 Use \fB1\fR for yes and \fB0\fR for no (default).
2873 .RE
2874
2875 .sp
2876 .ne 2
2877 .na
2878 \fBzfs_rebuild_max_segment\fR (ulong)
2879 .ad
2880 .RS 12n
2881 Maximum read segment size to issue when sequentially resilvering a
2882 top-level vdev.
2883 .sp
2884 Default value: \fB1,048,576\fR.
2885 .RE
2886
2887 .sp
2888 .ne 2
2889 .na
2890 \fBzfs_reconstruct_indirect_combinations_max\fR (int)
2891 .ad
2892 .RS 12n
2893 If an indirect split block contains more than this many possible unique
2894 combinations when being reconstructed, consider it too computationally
2895 expensive to check them all. Instead, try at most
2896 \fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
2897 combinations each time the block is accessed.  This allows all segment
2898 copies to participate fairly in the reconstruction when all combinations
2899 cannot be checked and prevents repeated use of one bad copy.
2900 .sp
2901 Default value: \fB4096\fR.
2902 .RE
2903
2904 .sp
2905 .ne 2
2906 .na
2907 \fBzfs_recover\fR (int)
2908 .ad
2909 .RS 12n
2910 Set to attempt to recover from fatal errors. This should only be used as a
2911 last resort, as it typically results in leaked space, or worse.
2912 .sp
2913 Use \fB1\fR for yes and \fB0\fR for no (default).
2914 .RE
2915
2916 .sp
2917 .ne 2
2918 .na
2919 \fBzfs_removal_ignore_errors\fR (int)
2920 .ad
2921 .RS 12n
2922 .sp
2923 Ignore hard IO errors during device removal.  When set, if a device encounters
2924 a hard IO error during the removal process the removal will not be cancelled.
2925 This can result in a normally recoverable block becoming permanently damaged
2926 and is not recommended.  This should only be used as a last resort when the
2927 pool cannot be returned to a healthy state prior to removing the device.
2928 .sp
2929 Default value: \fB0\fR.
2930 .RE
2931
2932 .sp
2933 .ne 2
2934 .na
2935 \fBzfs_removal_suspend_progress\fR (int)
2936 .ad
2937 .RS 12n
2938 .sp
2939 This is used by the test suite so that it can ensure that certain actions
2940 happen while in the middle of a removal.
2941 .sp
2942 Default value: \fB0\fR.
2943 .RE
2944
2945 .sp
2946 .ne 2
2947 .na
2948 \fBzfs_remove_max_segment\fR (int)
2949 .ad
2950 .RS 12n
2951 .sp
2952 The largest contiguous segment that we will attempt to allocate when removing
2953 a device.  This can be no larger than 16MB.  If there is a performance
2954 problem with attempting to allocate large blocks, consider decreasing this.
2955 .sp
2956 Default value: \fB16,777,216\fR (16MB).
2957 .RE
2958
2959 .sp
2960 .ne 2
2961 .na
2962 \fBzfs_resilver_disable_defer\fR (int)
2963 .ad
2964 .RS 12n
2965 Disables the \fBresilver_defer\fR feature, causing an operation that would
2966 start a resilver to restart one in progress immediately.
2967 .sp
2968 Default value: \fB0\fR (feature enabled).
2969 .RE
2970
2971 .sp
2972 .ne 2
2973 .na
2974 \fBzfs_resilver_min_time_ms\fR (int)
2975 .ad
2976 .RS 12n
2977 Resilvers are processed by the sync thread. While resilvering it will spend
2978 at least this much time working on a resilver between txg flushes.
2979 .sp
2980 Default value: \fB3,000\fR.
2981 .RE
2982
2983 .sp
2984 .ne 2
2985 .na
2986 \fBzfs_scan_ignore_errors\fR (int)
2987 .ad
2988 .RS 12n
2989 If set to a nonzero value, remove the DTL (dirty time list) upon
2990 completion of a pool scan (scrub) even if there were unrepairable
2991 errors.  It is intended to be used during pool repair or recovery to
2992 stop resilvering when the pool is next imported.
2993 .sp
2994 Default value: \fB0\fR.
2995 .RE
2996
2997 .sp
2998 .ne 2
2999 .na
3000 \fBzfs_scrub_min_time_ms\fR (int)
3001 .ad
3002 .RS 12n
3003 Scrubs are processed by the sync thread. While scrubbing it will spend
3004 at least this much time working on a scrub between txg flushes.
3005 .sp
3006 Default value: \fB1,000\fR.
3007 .RE
3008
3009 .sp
3010 .ne 2
3011 .na
3012 \fBzfs_scan_checkpoint_intval\fR (int)
3013 .ad
3014 .RS 12n
3015 To preserve progress across reboots the sequential scan algorithm periodically
3016 needs to stop metadata scanning and issue all the verification I/Os to disk.
3017 The frequency of this flushing is determined by the
3018 \fBzfs_scan_checkpoint_intval\fR tunable.
3019 .sp
3020 Default value: \fB7200\fR seconds (every 2 hours).
3021 .RE
3022
3023 .sp
3024 .ne 2
3025 .na
3026 \fBzfs_scan_fill_weight\fR (int)
3027 .ad
3028 .RS 12n
3029 This tunable affects how scrub and resilver I/O segments are ordered. A higher
3030 number indicates that we care more about how filled in a segment is, while a
3031 lower number indicates we care more about the size of the extent without
3032 considering the gaps within a segment. This value is only tunable upon module
3033 insertion. Changing the value afterwards will have no effect on scrub or
3034 resilver performance.
3035 .sp
3036 Default value: \fB3\fR.
3037 .RE
3038
3039 .sp
3040 .ne 2
3041 .na
3042 \fBzfs_scan_issue_strategy\fR (int)
3043 .ad
3044 .RS 12n
3045 Determines the order that data will be verified while scrubbing or resilvering.
3046 If set to \fB1\fR, data will be verified as sequentially as possible, given the
3047 amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This
3048 may improve scrub performance if the pool's data is very fragmented. If set to
3049 \fB2\fR, the largest mostly-contiguous chunk of found data will be verified
3050 first. By deferring scrubbing of small segments, we may later find adjacent data
3051 to coalesce and increase the segment size. If set to \fB0\fR, zfs will use
3052 strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a
3053 checkpoint.
3054 .sp
3055 Default value: \fB0\fR.
3056 .RE
3057
3058 .sp
3059 .ne 2
3060 .na
3061 \fBzfs_scan_legacy\fR (int)
3062 .ad
3063 .RS 12n
3064 A value of 0 indicates that scrubs and resilvers will gather metadata in
3065 memory before issuing sequential I/O. A value of 1 indicates that the legacy
3066 algorithm will be used where I/O is initiated as soon as it is discovered.
3067 Changing this value to 0 will not affect scrubs or resilvers that are already
3068 in progress.
3069 .sp
3070 Default value: \fB0\fR.
3071 .RE
3072
3073 .sp
3074 .ne 2
3075 .na
3076 \fBzfs_scan_max_ext_gap\fR (int)
3077 .ad
3078 .RS 12n
3079 Indicates the largest gap in bytes between scrub / resilver I/Os that will still
3080 be considered sequential for sorting purposes. Changing this value will not
3081 affect scrubs or resilvers that are already in progress.
3082 .sp
3083 Default value: \fB2097152 (2 MB)\fR.
3084 .RE
3085
3086 .sp
3087 .ne 2
3088 .na
3089 \fBzfs_scan_mem_lim_fact\fR (int)
3090 .ad
3091 .RS 12n
3092 Maximum fraction of RAM used for I/O sorting by sequential scan algorithm.
3093 This tunable determines the hard limit for I/O sorting memory usage.
3094 When the hard limit is reached we stop scanning metadata and start issuing
3095 data verification I/O. This is done until we get below the soft limit.
3096 .sp
3097 Default value: \fB20\fR which is 5% of RAM (1/20).
3098 .RE
3099
3100 .sp
3101 .ne 2
3102 .na
3103 \fBzfs_scan_mem_lim_soft_fact\fR (int)
3104 .ad
3105 .RS 12n
3106 The fraction of the hard limit used to determine the soft limit for I/O sorting
3107 by the sequential scan algorithm. When we cross this limit from below no action
3108 is taken. When we cross this limit from above it is because we are issuing
3109 verification I/O. In this case (unless the metadata scan is done) we stop
3110 issuing verification I/O and start scanning metadata again until we get to the
3111 hard limit.
3112 .sp
3113 Default value: \fB20\fR which is 5% of the hard limit (1/20).
3114 .RE
3115
3116 .sp
3117 .ne 2
3118 .na
3119 \fBzfs_scan_strict_mem_lim\fR (int)
3120 .ad
3121 .RS 12n
3122 Enforces tight memory limits on pool scans when a sequential scan is in
3123 progress. When disabled the memory limit may be exceeded by fast disks.
3124 .sp
3125 Default value: \fB0\fR.
3126 .RE
3127
3128 .sp
3129 .ne 2
3130 .na
3131 \fBzfs_scan_suspend_progress\fR (int)
3132 .ad
3133 .RS 12n
3134 Freezes a scrub/resilver in progress without actually pausing it. Intended for
3135 testing/debugging.
3136 .sp
3137 Default value: \fB0\fR.
3138 .RE
3139
3140
3141 .sp
3142 .ne 2
3143 .na
3144 \fBzfs_scan_vdev_limit\fR (int)
3145 .ad
3146 .RS 12n
3147 Maximum amount of data that can be issued concurrently for scrubs and
3148 resilvers per leaf device, given in bytes.
3149 .sp
3150 Default value: \fB41943040\fR.
3151 .RE
3152
3153 .sp
3154 .ne 2
3155 .na
3156 \fBzfs_send_corrupt_data\fR (int)
3157 .ad
3158 .RS 12n
3159 Allow sending of corrupt data (ignore read/checksum errors when sending data).
3160 .sp
3161 Use \fB1\fR for yes and \fB0\fR for no (default).
3162 .RE
3163
3164 .sp
3165 .ne 2
3166 .na
3167 \fBzfs_send_unmodified_spill_blocks\fR (int)
3168 .ad
3169 .RS 12n
3170 Include unmodified spill blocks in the send stream. Under certain circumstances
3171 previous versions of ZFS could incorrectly remove the spill block from an
3172 existing object.  Including unmodified copies of the spill blocks creates a
3173 backwards compatible stream which will recreate a spill block if it was
3174 incorrectly removed.
3175 .sp
3176 Use \fB1\fR for yes (default) and \fB0\fR for no.
3177 .RE
3178
3179 .sp
3180 .ne 2
3181 .na
3182 \fBzfs_send_no_prefetch_queue_ff\fR (int)
3183 .ad
3184 .RS 12n
3185 The fill fraction of the \fBzfs send\fR internal queues. The fill fraction
3186 controls the timing with which internal threads are woken up.
3187 .sp
3188 Default value: \fB20\fR.
3189 .RE
3190
3191 .sp
3192 .ne 2
3193 .na
3194 \fBzfs_send_no_prefetch_queue_length\fR (int)
3195 .ad
3196 .RS 12n
3197 The maximum number of bytes allowed in \fBzfs send\fR's internal queues.
3198 .sp
3199 Default value: \fB1,048,576\fR.
3200 .RE
3201
3202 .sp
3203 .ne 2
3204 .na
3205 \fBzfs_send_queue_ff\fR (int)
3206 .ad
3207 .RS 12n
3208 The fill fraction of the \fBzfs send\fR prefetch queue. The fill fraction
3209 controls the timing with which internal threads are woken up.
3210 .sp
3211 Default value: \fB20\fR.
3212 .RE
3213
3214 .sp
3215 .ne 2
3216 .na
3217 \fBzfs_send_queue_length\fR (int)
3218 .ad
3219 .RS 12n
3220 The maximum number of bytes allowed that will be prefetched by \fBzfs send\fR.
3221 This value must be at least twice the maximum block size in use.
3222 .sp
3223 Default value: \fB16,777,216\fR.
3224 .RE
3225
3226 .sp
3227 .ne 2
3228 .na
3229 \fBzfs_recv_queue_ff\fR (int)
3230 .ad
3231 .RS 12n
3232 The fill fraction of the \fBzfs receive\fR queue. The fill fraction
3233 controls the timing with which internal threads are woken up.
3234 .sp
3235 Default value: \fB20\fR.
3236 .RE
3237
3238 .sp
3239 .ne 2
3240 .na
3241 \fBzfs_recv_queue_length\fR (int)
3242 .ad
3243 .RS 12n
3244 The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value
3245 must be at least twice the maximum block size in use.
3246 .sp
3247 Default value: \fB16,777,216\fR.
3248 .RE
3249
3250 .sp
3251 .ne 2
3252 .na
3253 \fBzfs_recv_write_batch_size\fR (int)
3254 .ad
3255 .RS 12n
3256 The maximum amount of data (in bytes) that \fBzfs receive\fR will write in
3257 one DMU transaction.  This is the uncompressed size, even when receiving a
3258 compressed send stream.  This setting will not reduce the write size below
3259 a single block. Capped at a maximum of 32MB.
3260 .sp
3261 Default value: \fB1MB\fR.
3262 .RE
3263
3264 .sp
3265 .ne 2
3266 .na
3267 \fBzfs_override_estimate_recordsize\fR (ulong)
3268 .ad
3269 .RS 12n
3270 Setting this variable overrides the default logic for estimating block
3271 sizes when doing a zfs send. The default heuristic is that the average
3272 block size will be the current recordsize. Override this value if most data
3273 in your dataset is not of that size and you require accurate zfs send size
3274 estimates.
3275 .sp
3276 Default value: \fB0\fR.
3277 .RE
3278
3279 .sp
3280 .ne 2
3281 .na
3282 \fBzfs_sync_pass_deferred_free\fR (int)
3283 .ad
3284 .RS 12n
3285 Flushing of data to disk is done in passes. Defer frees starting in this pass.
3286 .sp
3287 Default value: \fB2\fR.
3288 .RE
3289
3290 .sp
3291 .ne 2
3292 .na
3293 \fBzfs_spa_discard_memory_limit\fR (int)
3294 .ad
3295 .RS 12n
3296 Maximum memory used for prefetching a checkpoint's space map on each
3297 vdev while discarding the checkpoint.
3298 .sp
3299 Default value: \fB16,777,216\fR.
3300 .RE
3301
3302 .sp
3303 .ne 2
3304 .na
3305 \fBzfs_special_class_metadata_reserve_pct\fR (int)
3306 .ad
3307 .RS 12n
3308 Only allow small data blocks to be allocated on the special and dedup vdev
3309 types when the available free space percentage on these vdevs exceeds this
3310 value. This ensures reserved space is available for pool meta data as the
3311 special vdevs approach capacity.
3312 .sp
3313 Default value: \fB25\fR.
3314 .RE
3315
3316 .sp
3317 .ne 2
3318 .na
3319 \fBzfs_sync_pass_dont_compress\fR (int)
3320 .ad
3321 .RS 12n
3322 Starting in this sync pass, we disable compression (including of metadata).
3323 With the default setting, in practice, we don't have this many sync passes,
3324 so this has no effect.
3325 .sp
3326 The original intent was that disabling compression would help the sync passes
3327 to converge. However, in practice disabling compression increases the average
3328 number of sync passes, because when we turn compression off, many blocks'
3329 sizes will change and thus we have to re-allocate (not overwrite) them. It
3330 also increases the number of 128KB allocations (e.g. for indirect blocks and
3331 spacemaps) because these will not be compressed. The 128K allocations are
3332 especially detrimental to performance on highly fragmented systems, which may
3333 have very few free segments of this size, and may need to load new metaslabs
3334 to satisfy 128K allocations.
3335 .sp
3336 Default value: \fB8\fR.
3337 .RE
3338
3339 .sp
3340 .ne 2
3341 .na
3342 \fBzfs_sync_pass_rewrite\fR (int)
3343 .ad
3344 .RS 12n
3345 Rewrite new block pointers starting in this pass
3346 .sp
3347 Default value: \fB2\fR.
3348 .RE
3349
3350 .sp
3351 .ne 2
3352 .na
3353 \fBzfs_sync_taskq_batch_pct\fR (int)
3354 .ad
3355 .RS 12n
3356 This controls the number of threads used by the dp_sync_taskq.  The default
3357 value of 75% will create a maximum of one thread per cpu.
3358 .sp
3359 Default value: \fB75\fR%.
3360 .RE
3361
3362 .sp
3363 .ne 2
3364 .na
3365 \fBzfs_trim_extent_bytes_max\fR (uint)
3366 .ad
3367 .RS 12n
3368 Maximum size of TRIM command.  Ranges larger than this will be split into
3369 chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being
3370 issued to the device.
3371 .sp
3372 Default value: \fB134,217,728\fR.
3373 .RE
3374
3375 .sp
3376 .ne 2
3377 .na
3378 \fBzfs_trim_extent_bytes_min\fR (uint)
3379 .ad
3380 .RS 12n
3381 Minimum size of TRIM commands.  TRIM ranges smaller than this will be skipped
3382 unless they're part of a larger range which was broken into chunks.  This is
3383 done because it's common for these small TRIMs to negatively impact overall
3384 performance.  This value can be set to 0 to TRIM all unallocated space.
3385 .sp
3386 Default value: \fB32,768\fR.
3387 .RE
3388
3389 .sp
3390 .ne 2
3391 .na
3392 \fBzfs_trim_metaslab_skip\fR (uint)
3393 .ad
3394 .RS 12n
3395 Skip uninitialized metaslabs during the TRIM process.  This option is useful
3396 for pools constructed from large thinly-provisioned devices where TRIM
3397 operations are slow.  As a pool ages an increasing fraction of the pool's
3398 metaslabs will be initialized, progressively degrading the usefulness of
3399 this option.  This setting is stored when starting a manual TRIM and will
3400 persist for the duration of the requested TRIM.
3401 .sp
3402 Default value: \fB0\fR.
3403 .RE
3404
3405 .sp
3406 .ne 2
3407 .na
3408 \fBzfs_trim_queue_limit\fR (uint)
3409 .ad
3410 .RS 12n
3411 Maximum number of queued TRIMs outstanding per leaf vdev.  The number of
3412 concurrent TRIM commands issued to the device is controlled by the
3413 \fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module
3414 options.
3415 .sp
3416 Default value: \fB10\fR.
3417 .RE
3418
3419 .sp
3420 .ne 2
3421 .na
3422 \fBzfs_trim_txg_batch\fR (uint)
3423 .ad
3424 .RS 12n
3425 The number of transaction groups worth of frees which should be aggregated
3426 before TRIM operations are issued to the device.  This setting represents a
3427 trade-off between issuing larger, more efficient TRIM operations and the
3428 delay before the recently trimmed space is available for use by the device.
3429 .sp
3430 Increasing this value will allow frees to be aggregated for a longer time.
3431 This will result in larger TRIM operations and potentially increased memory
3432 usage.  Decreasing this value will have the opposite effect.  The default
3433 value of 32 was determined to be a reasonable compromise.
3434 .sp
3435 Default value: \fB32\fR.
3436 .RE
3437
3438 .sp
3439 .ne 2
3440 .na
3441 \fBzfs_txg_history\fR (int)
3442 .ad
3443 .RS 12n
3444 Historical statistics for the last N txgs will be available in
3445 \fB/proc/spl/kstat/zfs/<pool>/txgs\fR
3446 .sp
3447 Default value: \fB0\fR.
3448 .RE
3449
3450 .sp
3451 .ne 2
3452 .na
3453 \fBzfs_txg_timeout\fR (int)
3454 .ad
3455 .RS 12n
3456 Flush dirty data to disk at least every N seconds (maximum txg duration)
3457 .sp
3458 Default value: \fB5\fR.
3459 .RE
3460
3461 .sp
3462 .ne 2
3463 .na
3464 \fBzfs_vdev_aggregate_trim\fR (int)
3465 .ad
3466 .RS 12n
3467 Allow TRIM I/Os to be aggregated.  This is normally not helpful because
3468 the extents to be trimmed will have already been aggregated by the
3469 metaslab.  This option is provided for debugging and performance analysis.
3470 .sp
3471 Default value: \fB0\fR.
3472 .RE
3473
3474 .sp
3475 .ne 2
3476 .na
3477 \fBzfs_vdev_aggregation_limit\fR (int)
3478 .ad
3479 .RS 12n
3480 Max vdev I/O aggregation size
3481 .sp
3482 Default value: \fB1,048,576\fR.
3483 .RE
3484
3485 .sp
3486 .ne 2
3487 .na
3488 \fBzfs_vdev_aggregation_limit_non_rotating\fR (int)
3489 .ad
3490 .RS 12n
3491 Max vdev I/O aggregation size for non-rotating media
3492 .sp
3493 Default value: \fB131,072\fR.
3494 .RE
3495
3496 .sp
3497 .ne 2
3498 .na
3499 \fBzfs_vdev_cache_bshift\fR (int)
3500 .ad
3501 .RS 12n
3502 Shift size to inflate reads to
3503 .sp
3504 Default value: \fB16\fR (effectively 65536).
3505 .RE
3506
3507 .sp
3508 .ne 2
3509 .na
3510 \fBzfs_vdev_cache_max\fR (int)
3511 .ad
3512 .RS 12n
3513 Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR
3514 size (default 64k).
3515 .sp
3516 Default value: \fB16384\fR.
3517 .RE
3518
3519 .sp
3520 .ne 2
3521 .na
3522 \fBzfs_vdev_cache_size\fR (int)
3523 .ad
3524 .RS 12n
3525 Total size of the per-disk cache in bytes.
3526 .sp
3527 Currently this feature is disabled as it has been found to not be helpful
3528 for performance and in some cases harmful.
3529 .sp
3530 Default value: \fB0\fR.
3531 .RE
3532
3533 .sp
3534 .ne 2
3535 .na
3536 \fBzfs_vdev_mirror_rotating_inc\fR (int)
3537 .ad
3538 .RS 12n
3539 A number by which the balancing algorithm increments the load calculation for
3540 the purpose of selecting the least busy mirror member when an I/O immediately
3541 follows its predecessor on rotational vdevs for the purpose of making decisions
3542 based on load.
3543 .sp
3544 Default value: \fB0\fR.
3545 .RE
3546
3547 .sp
3548 .ne 2
3549 .na
3550 \fBzfs_vdev_mirror_rotating_seek_inc\fR (int)
3551 .ad
3552 .RS 12n
3553 A number by which the balancing algorithm increments the load calculation for
3554 the purpose of selecting the least busy mirror member when an I/O lacks
3555 locality as defined by the zfs_vdev_mirror_rotating_seek_offset.  I/Os within
3556 this that are not immediately following the previous I/O are incremented by
3557 half.
3558 .sp
3559 Default value: \fB5\fR.
3560 .RE
3561
3562 .sp
3563 .ne 2
3564 .na
3565 \fBzfs_vdev_mirror_rotating_seek_offset\fR (int)
3566 .ad
3567 .RS 12n
3568 The maximum distance for the last queued I/O in which the balancing algorithm
3569 considers an I/O to have locality.
3570 See the section "ZFS I/O SCHEDULER".
3571 .sp
3572 Default value: \fB1048576\fR.
3573 .RE
3574
3575 .sp
3576 .ne 2
3577 .na
3578 \fBzfs_vdev_mirror_non_rotating_inc\fR (int)
3579 .ad
3580 .RS 12n
3581 A number by which the balancing algorithm increments the load calculation for
3582 the purpose of selecting the least busy mirror member on non-rotational vdevs
3583 when I/Os do not immediately follow one another.
3584 .sp
3585 Default value: \fB0\fR.
3586 .RE
3587
3588 .sp
3589 .ne 2
3590 .na
3591 \fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int)
3592 .ad
3593 .RS 12n
3594 A number by which the balancing algorithm increments the load calculation for
3595 the purpose of selecting the least busy mirror member when an I/O lacks
3596 locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within
3597 this that are not immediately following the previous I/O are incremented by
3598 half.
3599 .sp
3600 Default value: \fB1\fR.
3601 .RE
3602
3603 .sp
3604 .ne 2
3605 .na
3606 \fBzfs_vdev_read_gap_limit\fR (int)
3607 .ad
3608 .RS 12n
3609 Aggregate read I/O operations if the gap on-disk between them is within this
3610 threshold.
3611 .sp
3612 Default value: \fB32,768\fR.
3613 .RE
3614
3615 .sp
3616 .ne 2
3617 .na
3618 \fBzfs_vdev_write_gap_limit\fR (int)
3619 .ad
3620 .RS 12n
3621 Aggregate write I/O over gap
3622 .sp
3623 Default value: \fB4,096\fR.
3624 .RE
3625
3626 .sp
3627 .ne 2
3628 .na
3629 \fBzfs_vdev_raidz_impl\fR (string)
3630 .ad
3631 .RS 12n
3632 Parameter for selecting raidz parity implementation to use.
3633
3634 Options marked (always) below may be selected on module load as they are
3635 supported on all systems.
3636 The remaining options may only be set after the module is loaded, as they
3637 are available only if the implementations are compiled in and supported
3638 on the running system.
3639
3640 Once the module is loaded, the content of
3641 /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options
3642 with the currently selected one enclosed in [].
3643 Possible options are:
3644   fastest  - (always) implementation selected using built-in benchmark
3645   original - (always) original raidz implementation
3646   scalar   - (always) scalar raidz implementation
3647   sse2     - implementation using SSE2 instruction set (64bit x86 only)
3648   ssse3    - implementation using SSSE3 instruction set (64bit x86 only)
3649   avx2     - implementation using AVX2 instruction set (64bit x86 only)
3650   avx512f  - implementation using AVX512F instruction set (64bit x86 only)
3651   avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only)
3652   aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only)
3653   aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only)
3654   powerpc_altivec - implementation using Altivec (PowerPC only)
3655 .sp
3656 Default value: \fBfastest\fR.
3657 .RE
3658
3659 .sp
3660 .ne 2
3661 .na
3662 \fBzfs_vdev_scheduler\fR (charp)
3663 .ad
3664 .RS 12n
3665 \fBDEPRECATED\fR: This option exists for compatibility with older user
3666 configurations. It does nothing except print a warning to the kernel log if
3667 set.
3668 .sp
3669 .RE
3670
3671 .sp
3672 .ne 2
3673 .na
3674 \fBzfs_zevent_cols\fR (int)
3675 .ad
3676 .RS 12n
3677 When zevents are logged to the console use this as the word wrap width.
3678 .sp
3679 Default value: \fB80\fR.
3680 .RE
3681
3682 .sp
3683 .ne 2
3684 .na
3685 \fBzfs_zevent_console\fR (int)
3686 .ad
3687 .RS 12n
3688 Log events to the console
3689 .sp
3690 Use \fB1\fR for yes and \fB0\fR for no (default).
3691 .RE
3692
3693 .sp
3694 .ne 2
3695 .na
3696 \fBzfs_zevent_len_max\fR (int)
3697 .ad
3698 .RS 12n
3699 Max event queue length. A value of 0 will result in a calculated value which
3700 increases with the number of CPUs in the system (minimum 64 events). Events
3701 in the queue can be viewed with the \fBzpool events\fR command.
3702 .sp
3703 Default value: \fB0\fR.
3704 .RE
3705
3706 .sp
3707 .ne 2
3708 .na
3709 \fBzfs_zevent_retain_max\fR (int)
3710 .ad
3711 .RS 12n
3712 Maximum recent zevent records to retain for duplicate checking.  Setting
3713 this value to zero disables duplicate detection.
3714 .sp
3715 Default value: \fB2000\fR.
3716 .RE
3717
3718 .sp
3719 .ne 2
3720 .na
3721 \fBzfs_zevent_retain_expire_secs\fR (int)
3722 .ad
3723 .RS 12n
3724 Lifespan for a recent ereport that was retained for duplicate checking.
3725 .sp
3726 Default value: \fB900\fR.
3727 .RE
3728
3729 .na
3730 \fBzfs_zil_clean_taskq_maxalloc\fR (int)
3731 .ad
3732 .RS 12n
3733 The maximum number of taskq entries that are allowed to be cached.  When this
3734 limit is exceeded transaction records (itxs) will be cleaned synchronously.
3735 .sp
3736 Default value: \fB1048576\fR.
3737 .RE
3738
3739 .sp
3740 .ne 2
3741 .na
3742 \fBzfs_zil_clean_taskq_minalloc\fR (int)
3743 .ad
3744 .RS 12n
3745 The number of taskq entries that are pre-populated when the taskq is first
3746 created and are immediately available for use.
3747 .sp
3748 Default value: \fB1024\fR.
3749 .RE
3750
3751 .sp
3752 .ne 2
3753 .na
3754 \fBzfs_zil_clean_taskq_nthr_pct\fR (int)
3755 .ad
3756 .RS 12n
3757 This controls the number of threads used by the dp_zil_clean_taskq.  The default
3758 value of 100% will create a maximum of one thread per cpu.
3759 .sp
3760 Default value: \fB100\fR%.
3761 .RE
3762
3763 .sp
3764 .ne 2
3765 .na
3766 \fBzil_maxblocksize\fR (int)
3767 .ad
3768 .RS 12n
3769 This sets the maximum block size used by the ZIL.  On very fragmented pools,
3770 lowering this (typically to 36KB) can improve performance.
3771 .sp
3772 Default value: \fB131072\fR (128KB).
3773 .RE
3774
3775 .sp
3776 .ne 2
3777 .na
3778 \fBzil_nocacheflush\fR (int)
3779 .ad
3780 .RS 12n
3781 Disable the cache flush commands that are normally sent to the disk(s) by
3782 the ZIL after an LWB write has completed. Setting this will cause ZIL
3783 corruption on power loss if a volatile out-of-order write cache is enabled.
3784 .sp
3785 Use \fB1\fR for yes and \fB0\fR for no (default).
3786 .RE
3787
3788 .sp
3789 .ne 2
3790 .na
3791 \fBzil_replay_disable\fR (int)
3792 .ad
3793 .RS 12n
3794 Disable intent logging replay. Replay can be disabled to recover from a
3795 corrupted ZIL.
3796 .sp
3797 Use \fB1\fR for yes and \fB0\fR for no (default).
3798 .RE
3799
3800 .sp
3801 .ne 2
3802 .na
3803 \fBzil_slog_bulk\fR (ulong)
3804 .ad
3805 .RS 12n
3806 Limit SLOG write size per commit executed with synchronous priority.
3807 Any writes above that will be executed with lower (asynchronous) priority
3808 to limit potential SLOG device abuse by a single active ZIL writer.
3809 .sp
3810 Default value: \fB786,432\fR.
3811 .RE
3812
3813 .sp
3814 .ne 2
3815 .na
3816 \fBzio_deadman_log_all\fR (int)
3817 .ad
3818 .RS 12n
3819 If non-zero, the zio deadman will produce debugging messages (see
3820 \fBzfs_dbgmsg_enable\fR) for all zios, rather than only for leaf
3821 zios possessing a vdev. This is meant to be used by developers to gain
3822 diagnostic information for hang conditions which don't involve a mutex
3823 or other locking primitive; typically conditions in which a thread in
3824 the zio pipeline is looping indefinitely.
3825 .sp
3826 Default value: \fB0\fR.
3827 .RE
3828
3829 .sp
3830 .ne 2
3831 .na
3832 \fBzio_decompress_fail_fraction\fR (int)
3833 .ad
3834 .RS 12n
3835 If non-zero, this value represents the denominator of the probability that zfs
3836 should induce a decompression failure. For instance, for a 5% decompression
3837 failure rate, this value should be set to 20.
3838 .sp
3839 Default value: \fB0\fR.
3840 .RE
3841
3842 .sp
3843 .ne 2
3844 .na
3845 \fBzio_slow_io_ms\fR (int)
3846 .ad
3847 .RS 12n
3848 When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
3849 complete, it is marked as a slow I/O.  Each slow I/O causes a delay zevent.  Slow
3850 I/O counters can be seen with "zpool status -s".
3851
3852 .sp
3853 Default value: \fB30,000\fR.
3854 .RE
3855
3856 .sp
3857 .ne 2
3858 .na
3859 \fBzio_dva_throttle_enabled\fR (int)
3860 .ad
3861 .RS 12n
3862 Throttle block allocations in the I/O pipeline. This allows for
3863 dynamic allocation distribution when devices are imbalanced.
3864 When enabled, the maximum number of pending allocations per top-level vdev
3865 is limited by \fBzfs_vdev_queue_depth_pct\fR.
3866 .sp
3867 Default value: \fB1\fR.
3868 .RE
3869
3870 .sp
3871 .ne 2
3872 .na
3873 \fBzio_requeue_io_start_cut_in_line\fR (int)
3874 .ad
3875 .RS 12n
3876 Prioritize requeued I/O
3877 .sp
3878 Default value: \fB0\fR.
3879 .RE
3880
3881 .sp
3882 .ne 2
3883 .na
3884 \fBzio_taskq_batch_pct\fR (uint)
3885 .ad
3886 .RS 12n
3887 Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
3888 for I/O. These workers are responsible for I/O work such as compression and
3889 checksum calculations. Fractional number of CPUs will be rounded down.
3890 .sp
3891 The default value of 75 was chosen to avoid using all CPUs which can result in
3892 latency issues and inconsistent application performance, especially when high
3893 compression is enabled.
3894 .sp
3895 Default value: \fB75\fR.
3896 .RE
3897
3898 .sp
3899 .ne 2
3900 .na
3901 \fBzvol_inhibit_dev\fR (uint)
3902 .ad
3903 .RS 12n
3904 Do not create zvol device nodes. This may slightly improve startup time on
3905 systems with a very large number of zvols.
3906 .sp
3907 Use \fB1\fR for yes and \fB0\fR for no (default).
3908 .RE
3909
3910 .sp
3911 .ne 2
3912 .na
3913 \fBzvol_major\fR (uint)
3914 .ad
3915 .RS 12n
3916 Major number for zvol block devices
3917 .sp
3918 Default value: \fB230\fR.
3919 .RE
3920
3921 .sp
3922 .ne 2
3923 .na
3924 \fBzvol_max_discard_blocks\fR (ulong)
3925 .ad
3926 .RS 12n
3927 Discard (aka TRIM) operations done on zvols will be done in batches of this
3928 many blocks, where block size is determined by the \fBvolblocksize\fR property
3929 of a zvol.
3930 .sp
3931 Default value: \fB16,384\fR.
3932 .RE
3933
3934 .sp
3935 .ne 2
3936 .na
3937 \fBzvol_prefetch_bytes\fR (uint)
3938 .ad
3939 .RS 12n
3940 When adding a zvol to the system prefetch \fBzvol_prefetch_bytes\fR
3941 from the start and end of the volume.  Prefetching these regions
3942 of the volume is desirable because they are likely to be accessed
3943 immediately by \fBblkid(8)\fR or by the kernel scanning for a partition
3944 table.
3945 .sp
3946 Default value: \fB131,072\fR.
3947 .RE
3948
3949 .sp
3950 .ne 2
3951 .na
3952 \fBzvol_request_sync\fR (uint)
3953 .ad
3954 .RS 12n
3955 When processing I/O requests for a zvol submit them synchronously.  This
3956 effectively limits the queue depth to 1 for each I/O submitter.  When set
3957 to 0 requests are handled asynchronously by a thread pool.  The number of
3958 requests which can be handled concurrently is controlled by \fBzvol_threads\fR.
3959 .sp
3960 Default value: \fB0\fR.
3961 .RE
3962
3963 .sp
3964 .ne 2
3965 .na
3966 \fBzvol_threads\fR (uint)
3967 .ad
3968 .RS 12n
3969 Max number of threads which can handle zvol I/O requests concurrently.
3970 .sp
3971 Default value: \fB32\fR.
3972 .RE
3973
3974 .sp
3975 .ne 2
3976 .na
3977 \fBzvol_volmode\fR (uint)
3978 .ad
3979 .RS 12n
3980 Defines the behaviour of zvol block devices when \fBvolmode\fR is set to \fBdefault\fR.
3981 Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none).
3982 .sp
3983 Default value: \fB1\fR.
3984 .RE
3985
3986 .SH ZFS I/O SCHEDULER
3987 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
3988 The I/O scheduler determines when and in what order those operations are
3989 issued.  The I/O scheduler divides operations into five I/O classes
3990 prioritized in the following order: sync read, sync write, async read,
3991 async write, and scrub/resilver.  Each queue defines the minimum and
3992 maximum number of concurrent operations that may be issued to the
3993 device.  In addition, the device has an aggregate maximum,
3994 \fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
3995 must not exceed the aggregate maximum.  If the sum of the per-queue
3996 maximums exceeds the aggregate maximum, then the number of active I/Os
3997 may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
3998 be issued regardless of whether all per-queue minimums have been met.
3999 .sp
4000 For many physical devices, throughput increases with the number of
4001 concurrent operations, but latency typically suffers. Further, physical
4002 devices typically have a limit at which more concurrent operations have no
4003 effect on throughput or can actually cause it to decrease.
4004 .sp
4005 The scheduler selects the next operation to issue by first looking for an
4006 I/O class whose minimum has not been satisfied. Once all are satisfied and
4007 the aggregate maximum has not been hit, the scheduler looks for classes
4008 whose maximum has not been satisfied. Iteration through the I/O classes is
4009 done in the order specified above. No further operations are issued if the
4010 aggregate maximum number of concurrent operations has been hit or if there
4011 are no operations queued for an I/O class that has not hit its maximum.
4012 Every time an I/O is queued or an operation completes, the I/O scheduler
4013 looks for new operations to issue.
4014 .sp
4015 In general, smaller max_active's will lead to lower latency of synchronous
4016 operations.  Larger max_active's may lead to higher overall throughput,
4017 depending on underlying storage.
4018 .sp
4019 The ratio of the queues' max_actives determines the balance of performance
4020 between reads, writes, and scrubs.  E.g., increasing
4021 \fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete
4022 more quickly, but reads and writes to have higher latency and lower throughput.
4023 .sp
4024 All I/O classes have a fixed maximum number of outstanding operations
4025 except for the async write class. Asynchronous writes represent the data
4026 that is committed to stable storage during the syncing stage for
4027 transaction groups. Transaction groups enter the syncing state
4028 periodically so the number of queued async writes will quickly burst up
4029 and then bleed down to zero. Rather than servicing them as quickly as
4030 possible, the I/O scheduler changes the maximum number of active async
4031 write I/Os according to the amount of dirty data in the pool.  Since
4032 both throughput and latency typically increase with the number of
4033 concurrent operations issued to physical devices, reducing the
4034 burstiness in the number of concurrent operations also stabilizes the
4035 response time of operations from other -- and in particular synchronous
4036 -- queues. In broad strokes, the I/O scheduler will issue more
4037 concurrent operations from the async write queue as there's more dirty
4038 data in the pool.
4039 .sp
4040 Async Writes
4041 .sp
4042 The number of concurrent operations issued for the async write I/O class
4043 follows a piece-wise linear function defined by a few adjustable points.
4044 .nf
4045
4046        |              o---------| <-- zfs_vdev_async_write_max_active
4047   ^    |             /^         |
4048   |    |            / |         |
4049 active |           /  |         |
4050  I/O   |          /   |         |
4051 count  |         /    |         |
4052        |        /     |         |
4053        |-------o      |         | <-- zfs_vdev_async_write_min_active
4054       0|_______^______|_________|
4055        0%      |      |       100% of zfs_dirty_data_max
4056                |      |
4057                |      `-- zfs_vdev_async_write_active_max_dirty_percent
4058                `--------- zfs_vdev_async_write_active_min_dirty_percent
4059
4060 .fi
4061 Until the amount of dirty data exceeds a minimum percentage of the dirty
4062 data allowed in the pool, the I/O scheduler will limit the number of
4063 concurrent operations to the minimum. As that threshold is crossed, the
4064 number of concurrent operations issued increases linearly to the maximum at
4065 the specified maximum percentage of the dirty data allowed in the pool.
4066 .sp
4067 Ideally, the amount of dirty data on a busy pool will stay in the sloped
4068 part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR
4069 and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the
4070 maximum percentage, this indicates that the rate of incoming data is
4071 greater than the rate that the backend storage can handle. In this case, we
4072 must further throttle incoming writes, as described in the next section.
4073
4074 .SH ZFS TRANSACTION DELAY
4075 We delay transactions when we've determined that the backend storage
4076 isn't able to accommodate the rate of incoming writes.
4077 .sp
4078 If there is already a transaction waiting, we delay relative to when
4079 that transaction will finish waiting.  This way the calculated delay time
4080 is independent of the number of threads concurrently executing
4081 transactions.
4082 .sp
4083 If we are the only waiter, wait relative to when the transaction
4084 started, rather than the current time.  This credits the transaction for
4085 "time already served", e.g. reading indirect blocks.
4086 .sp
4087 The minimum time for a transaction to take is calculated as:
4088 .nf
4089     min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
4090     min_time is then capped at 100 milliseconds.
4091 .fi
4092 .sp
4093 The delay has two degrees of freedom that can be adjusted via tunables.  The
4094 percentage of dirty data at which we start to delay is defined by
4095 \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above
4096 \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to
4097 delay after writing at full speed has failed to keep up with the incoming write
4098 rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking,
4099 this variable determines the amount of delay at the midpoint of the curve.
4100 .sp
4101 .nf
4102 delay
4103  10ms +-------------------------------------------------------------*+
4104       |                                                             *|
4105   9ms +                                                             *+
4106       |                                                             *|
4107   8ms +                                                             *+
4108       |                                                            * |
4109   7ms +                                                            * +
4110       |                                                            * |
4111   6ms +                                                            * +
4112       |                                                            * |
4113   5ms +                                                           *  +
4114       |                                                           *  |
4115   4ms +                                                           *  +
4116       |                                                           *  |
4117   3ms +                                                          *   +
4118       |                                                          *   |
4119   2ms +                                              (midpoint) *    +
4120       |                                                  |    **     |
4121   1ms +                                                  v ***       +
4122       |             zfs_delay_scale ---------->     ********         |
4123     0 +-------------------------------------*********----------------+
4124       0%                    <- zfs_dirty_data_max ->               100%
4125 .fi
4126 .sp
4127 Note that since the delay is added to the outstanding time remaining on the
4128 most recent transaction, the delay is effectively the inverse of IOPS.
4129 Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
4130 was chosen such that small changes in the amount of accumulated dirty data
4131 in the first 3/4 of the curve yield relatively small differences in the
4132 amount of delay.
4133 .sp
4134 The effects can be easier to understand when the amount of delay is
4135 represented on a log scale:
4136 .sp
4137 .nf
4138 delay
4139 100ms +-------------------------------------------------------------++
4140       +                                                              +
4141       |                                                              |
4142       +                                                             *+
4143  10ms +                                                             *+
4144       +                                                           ** +
4145       |                                              (midpoint)  **  |
4146       +                                                  |     **    +
4147   1ms +                                                  v ****      +
4148       +             zfs_delay_scale ---------->        *****         +
4149       |                                             ****             |
4150       +                                          ****                +
4151 100us +                                        **                    +
4152       +                                       *                      +
4153       |                                      *                       |
4154       +                                     *                        +
4155  10us +                                     *                        +
4156       +                                                              +
4157       |                                                              |
4158       +                                                              +
4159       +--------------------------------------------------------------+
4160       0%                    <- zfs_dirty_data_max ->               100%
4161 .fi
4162 .sp
4163 Note here that only as the amount of dirty data approaches its limit does
4164 the delay start to increase rapidly. The goal of a properly tuned system
4165 should be to keep the amount of dirty data out of that range by first
4166 ensuring that the appropriate limits are set for the I/O scheduler to reach
4167 optimal throughput on the backend storage, and then by changing the value
4168 of \fBzfs_delay_scale\fR to increase the steepness of the curve.