sys/contrib/openzfs/module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2018, Joyent, Inc.
  24  * Copyright (c) 2011, 2020, Delphix. All rights reserved.
  25  * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
  26  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  28  * Copyright (c) 2020, George Amanakis. All rights reserved.
  29  * Copyright (c) 2019, Klara Inc.
  30  * Copyright (c) 2019, Allan Jude
  31  * Copyright (c) 2020, The FreeBSD Foundation [1]
  32  *
  33  * [1] Portions of this software were developed by Allan Jude
  34  *     under sponsorship from the FreeBSD Foundation.
  35  */
  36
  37 /*
  38  * DVA-based Adjustable Replacement Cache
  39  *
  40  * While much of the theory of operation used here is
  41  * based on the self-tuning, low overhead replacement cache
  42  * presented by Megiddo and Modha at FAST 2003, there are some
  43  * significant differences:
  44  *
  45  * 1. The Megiddo and Modha model assumes any page is evictable.
  46  * Pages in its cache cannot be "locked" into memory.  This makes
  47  * the eviction algorithm simple: evict the last page in the list.
  48  * This also makes the performance characteristics easy to reason
  49  * about.  Our cache is not so simple.  At any given moment, some
  50  * subset of the blocks in the cache are un-evictable because we
  51  * have handed out a reference to them.  Blocks are only evictable
  52  * when there are no external references active.  This makes
  53  * eviction far more problematic:  we choose to evict the evictable
  54  * blocks that are the "lowest" in the list.
  55  *
  56  * There are times when it is not possible to evict the requested
  57  * space.  In these circumstances we are unable to adjust the cache
  58  * size.  To prevent the cache growing unbounded at these times we
  59  * implement a "cache throttle" that slows the flow of new data
  60  * into the cache until we can make space available.
  61  *
  62  * 2. The Megiddo and Modha model assumes a fixed cache size.
  63  * Pages are evicted when the cache is full and there is a cache
  64  * miss.  Our model has a variable sized cache.  It grows with
  65  * high use, but also tries to react to memory pressure from the
  66  * operating system: decreasing its size when system memory is
  67  * tight.
  68  *
  69  * 3. The Megiddo and Modha model assumes a fixed page size. All
  70  * elements of the cache are therefore exactly the same size.  So
  71  * when adjusting the cache size following a cache miss, it's simply
  72  * a matter of choosing a single page to evict.  In our model, we
  73  * have variable sized cache blocks (ranging from 512 bytes to
  74  * 128K bytes).  We therefore choose a set of blocks to evict to make
  75  * space for a cache miss that approximates as closely as possible
  76  * the space used by the new block.
  77  *
  78  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  79  * by N. Megiddo & D. Modha, FAST 2003
  80  */
  81
  82 /*
  83  * The locking model:
  84  *
  85  * A new reference to a cache buffer can be obtained in two
  86  * ways: 1) via a hash table lookup using the DVA as a key,
  87  * or 2) via one of the ARC lists.  The arc_read() interface
  88  * uses method 1, while the internal ARC algorithms for
  89  * adjusting the cache use method 2.  We therefore provide two
  90  * types of locks: 1) the hash table lock array, and 2) the
  91  * ARC list locks.
  92  *
  93  * Buffers do not have their own mutexes, rather they rely on the
  94  * hash table mutexes for the bulk of their protection (i.e. most
  95  * fields in the arc_buf_hdr_t are protected by these mutexes).
  96  *
  97  * buf_hash_find() returns the appropriate mutex (held) when it
  98  * locates the requested buffer in the hash table.  It returns
  99  * NULL for the mutex if the buffer was not in the table.
 100  *
 101  * buf_hash_remove() expects the appropriate hash mutex to be
 102  * already held before it is invoked.
 103  *
 104  * Each ARC state also has a mutex which is used to protect the
 105  * buffer list associated with the state.  When attempting to
 106  * obtain a hash table lock while holding an ARC list lock you
 107  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 108  * the active state mutex must be held before the ghost state mutex.
 109  *
 110  * It is also possible to register a callback which is run when the
 111  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 112  * this case the arc user should drop a reference on some arc buffers so
 113  * they can be reclaimed and the arc_meta_limit honored.  For example,
 114  * when using the ZPL each dentry holds a reference on a znode.  These
 115  * dentries must be pruned before the arc buffer holding the znode can
 116  * be safely evicted.
 117  *
 118  * Note that the majority of the performance stats are manipulated
 119  * with atomic operations.
 120  *
 121  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 122  *
 123  *      - L2ARC buflist creation
 124  *      - L2ARC buflist eviction
 125  *      - L2ARC write completion, which walks L2ARC buflists
 126  *      - ARC header destruction, as it removes from L2ARC buflists
 127  *      - ARC header release, as it removes from L2ARC buflists
 128  */
 129
 130 /*
 131  * ARC operation:
 132  *
 133  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 134  * This structure can point either to a block that is still in the cache or to
 135  * one that is only accessible in an L2 ARC device, or it can provide
 136  * information about a block that was recently evicted. If a block is
 137  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 138  * information to retrieve it from the L2ARC device. This information is
 139  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 140  * that is in this state cannot access the data directly.
 141  *
 142  * Blocks that are actively being referenced or have not been evicted
 143  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 144  * the arc_buf_hdr_t that will point to the data block in memory. A block can
 145  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 146  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 147  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 148  *
 149  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 150  * ability to store the physical data (b_pabd) associated with the DVA of the
 151  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 152  * it will match its on-disk compression characteristics. This behavior can be
 153  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 154  * compressed ARC functionality is disabled, the b_pabd will point to an
 155  * uncompressed version of the on-disk data.
 156  *
 157  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 158  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 159  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 160  * consumer. The ARC will provide references to this data and will keep it
 161  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
 162  * data block and will evict any arc_buf_t that is no longer referenced. The
 163  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
 164  * "overhead_size" kstat.
 165  *
 166  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 167  * compressed form. The typical case is that consumers will want uncompressed
 168  * data, and when that happens a new data buffer is allocated where the data is
 169  * decompressed for them to use. Currently the only consumer who wants
 170  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 171  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 172  * with the arc_buf_hdr_t.
 173  *
 174  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
 175  * first one is owned by a compressed send consumer (and therefore references
 176  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
 177  * used by any other consumer (and has its own uncompressed copy of the data
 178  * buffer).
 179  *
 180  *   arc_buf_hdr_t
 181  *   +-----------+
 182  *   | fields    |
 183  *   | common to |
 184  *   | L1- and   |
 185  *   | L2ARC     |
 186  *   +-----------+
 187  *   | l2arc_buf_hdr_t
 188  *   |           |
 189  *   +-----------+
 190  *   | l1arc_buf_hdr_t
 191  *   |           |              arc_buf_t
 192  *   | b_buf     +------------>+-----------+      arc_buf_t
 193  *   | b_pabd    +-+           |b_next     +---->+-----------+
 194  *   +-----------+ |           |-----------|     |b_next     +-->NULL
 195  *                 |           |b_comp = T |     +-----------+
 196  *                 |           |b_data     +-+   |b_comp = F |
 197  *                 |           +-----------+ |   |b_data     +-+
 198  *                 +->+------+               |   +-----------+ |
 199  *        compressed  |      |               |                 |
 200  *           data     |      |<--------------+                 | uncompressed
 201  *                    +------+          compressed,            |     data
 202  *                                        shared               +-->+------+
 203  *                                         data                    |      |
 204  *                                                                 |      |
 205  *                                                                 +------+
 206  *
 207  * When a consumer reads a block, the ARC must first look to see if the
 208  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 209  * arc_buf_t and either copies uncompressed data into a new data buffer from an
 210  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 211  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 212  * hdr is compressed and the desired compression characteristics of the
 213  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 214  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 215  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 216  * be anywhere in the hdr's list.
 217  *
 218  * The diagram below shows an example of an uncompressed ARC hdr that is
 219  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 220  * the last element in the buf list):
 221  *
 222  *                arc_buf_hdr_t
 223  *                +-----------+
 224  *                |           |
 225  *                |           |
 226  *                |           |
 227  *                +-----------+
 228  * l2arc_buf_hdr_t|           |
 229  *                |           |
 230  *                +-----------+
 231  * l1arc_buf_hdr_t|           |
 232  *                |           |                 arc_buf_t    (shared)
 233  *                |    b_buf  +------------>+---------+      arc_buf_t
 234  *                |           |             |b_next   +---->+---------+
 235  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 236  *                +-----------+ |           |         |     +---------+
 237  *                              |           |b_data   +-+   |         |
 238  *                              |           +---------+ |   |b_data   +-+
 239  *                              +->+------+             |   +---------+ |
 240  *                                 |      |             |               |
 241  *                   uncompressed  |      |             |               |
 242  *                        data     +------+             |               |
 243  *                                    ^                 +->+------+     |
 244  *                                    |       uncompressed |      |     |
 245  *                                    |           data     |      |     |
 246  *                                    |                    +------+     |
 247  *                                    +---------------------------------+
 248  *
 249  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 250  * since the physical block is about to be rewritten. The new data contents
 251  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 252  * it may compress the data before writing it to disk. The ARC will be called
 253  * with the transformed data and will bcopy the transformed on-disk block into
 254  * a newly allocated b_pabd. Writes are always done into buffers which have
 255  * either been loaned (and hence are new and don't have other readers) or
 256  * buffers which have been released (and hence have their own hdr, if there
 257  * were originally other readers of the buf's original hdr). This ensures that
 258  * the ARC only needs to update a single buf and its hdr after a write occurs.
 259  *
 260  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 261  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 262  * that when compressed ARC is enabled, the L2ARC blocks are identical
 263  * to the on-disk block in the main data pool. This provides a significant
 264  * advantage since the ARC can leverage the bp's checksum when reading from the
 265  * L2ARC to determine if the contents are valid. However, if the compressed
 266  * ARC is disabled, then the L2ARC's block must be transformed to look
 267  * like the physical block in the main data pool before comparing the
 268  * checksum and determining its validity.
 269  *
 270  * The L1ARC has a slightly different system for storing encrypted data.
 271  * Raw (encrypted + possibly compressed) data has a few subtle differences from
 272  * data that is just compressed. The biggest difference is that it is not
 273  * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
 274  * The other difference is that encryption cannot be treated as a suggestion.
 275  * If a caller would prefer compressed data, but they actually wind up with
 276  * uncompressed data the worst thing that could happen is there might be a
 277  * performance hit. If the caller requests encrypted data, however, we must be
 278  * sure they actually get it or else secret information could be leaked. Raw
 279  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
 280  * may have both an encrypted version and a decrypted version of its data at
 281  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
 282  * copied out of this header. To avoid complications with b_pabd, raw buffers
 283  * cannot be shared.
 284  */
 285
 286 #include <sys/spa.h>
 287 #include <sys/zio.h>
 288 #include <sys/spa_impl.h>
 289 #include <sys/zio_compress.h>
 290 #include <sys/zio_checksum.h>
 291 #include <sys/zfs_context.h>
 292 #include <sys/arc.h>
 293 #include <sys/zfs_refcount.h>
 294 #include <sys/vdev.h>
 295 #include <sys/vdev_impl.h>
 296 #include <sys/dsl_pool.h>
 297 #include <sys/multilist.h>
 298 #include <sys/abd.h>
 299 #include <sys/zil.h>
 300 #include <sys/fm/fs/zfs.h>
 301 #include <sys/callb.h>
 302 #include <sys/kstat.h>
 303 #include <sys/zthr.h>
 304 #include <zfs_fletcher.h>
 305 #include <sys/arc_impl.h>
 306 #include <sys/trace_zfs.h>
 307 #include <sys/aggsum.h>
 308 #include <sys/wmsum.h>
 309 #include <cityhash.h>
 310 #include <sys/vdev_trim.h>
 311 #include <sys/zfs_racct.h>
 312 #include <sys/zstd/zstd.h>
 313
 314 #ifndef _KERNEL
 315 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 316 boolean_t arc_watch = B_FALSE; /* defined only when _KERNEL is undefined */
 317 #endif /* !_KERNEL */
 318
 319 /*
 320  * Reaper zthr.  Its job is to keep enough free memory in the system, by
 321  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
 322  * arc_available_memory().
 323  */
 324 static zthr_t *arc_reap_zthr;
 325
 326 /*
 327  * Eviction zthr.  Its job is to keep arc_size under arc_c, by calling
 328  * arc_evict(), which improves arc_is_overflowing().
 329  */
 330 static zthr_t *arc_evict_zthr;
 331 static arc_buf_hdr_t **arc_state_evict_markers; /* presumably marker hdrs */
 332 static int arc_state_evict_marker_count; /* presumably their count; confirm */
 333
 334 static kmutex_t arc_evict_lock; /* NOTE(review): guarded fields not shown here */
 335 static boolean_t arc_evict_needed = B_FALSE;
 336
 337 /*
 338  * Running total of bytes evicted since boot.
 339  */
 340 static uint64_t arc_evict_count;
 341
 342 /*
 343  * List of arc_evict_waiter_t's, one per thread that is waiting for
 344  * arc_evict_count to reach a specific value.
 345  */
 346 static list_t arc_evict_waiters;
 347
 348 /*
 349  * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
 350  * the requested amount of data to be evicted.  For example, by default for
 351  * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
 352  * Since this is above 100%, it ensures that progress is made towards getting
 353  * arc_size under arc_c.  Since this is finite, it ensures that allocations
 354  * can still happen, even during the potentially long time that arc_size is
 355  * more than arc_c.
 356  */
 357 int zfs_arc_eviction_pct = 200; /* percent of the requested size */
 358
 359 /*
 360  * The number of headers to evict in arc_evict_state_impl() before
 361  * dropping the sublist lock and evicting from another sublist. A lower
 362  * value means we're more likely to evict the "correct" header (i.e. the
 363  * oldest header in the arc state), but comes with higher overhead
 364  * (i.e. more invocations of arc_evict_state_impl()).
 365  */
 366 int zfs_arc_evict_batch_limit = 10; /* headers per sublist visit */
 367
 368 /* number of seconds before growing cache again */
 369 int arc_grow_retry = 5;
 370
 371 /*
 372  * Minimum time between calls to arc_kmem_reap_soon() (ms, per the suffix).
 373  */
 374 int arc_kmem_cache_reap_retry_ms = 1000;
 375
 376 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 377 int zfs_arc_overflow_shift = 8;
 378
 379 /* shift of arc_c for calculating both min and max arc_p */
 380 int arc_p_min_shift = 4;
 381
 382 /* log2(fraction of arc to reclaim) */
 383 int arc_shrink_shift = 7; /* i.e. 1/128 of the arc */
 384
 385 /* percent of pagecache to reclaim arc to */
 386 #ifdef _KERNEL
 387 uint_t zfs_arc_pc_percent = 0;
 388 #endif /* _KERNEL */
 389
 390 /*
 391  * log2(fraction of ARC which must be free to allow growing).
 392  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 393  * when reading a new block into the ARC, we will evict an equal-sized block
 394  * from the ARC.
 395  *
 396  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 397  * we will still not allow it to grow.
 398  */
 399 int                     arc_no_grow_shift = 5; /* 5 < arc_shrink_shift (7) */
 400
 401
 402 /*
 403  * minimum lifespan of a prefetch block in clock ticks (NOTE(review): the _ms
 404  * names suggest milliseconds -- confirm); initialized in arc_init()
 405  */
 406 static int              arc_min_prefetch_ms;
 407 static int              arc_min_prescient_prefetch_ms;
 408
 409 /*
 410  * If at least this percent of memory is free, don't throttle.
 411  */
 412 int arc_lotsfree_percent = 10;
 413
 414 /*
 415  * The arc has filled available memory and has now warmed up.
 416  */
 417 boolean_t arc_warm;
 418
 419 /*
 420  * These tunables are for performance analysis.
 421  */
 422 unsigned long zfs_arc_max = 0;
 423 unsigned long zfs_arc_min = 0;
 424 unsigned long zfs_arc_meta_limit = 0;
 425 unsigned long zfs_arc_meta_min = 0;
 426 unsigned long zfs_arc_dnode_limit = 0;
 427 unsigned long zfs_arc_dnode_reduce_percent = 10;
 428 int zfs_arc_grow_retry = 0; /* NOTE(review): cf. arc_grow_retry; confirm use */
 429 int zfs_arc_shrink_shift = 0; /* NOTE(review): cf. arc_shrink_shift */
 430 int zfs_arc_p_min_shift = 0; /* NOTE(review): cf. arc_p_min_shift */
 431 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 432
 433 /*
 434  * ARC dirty data constraints (percentages) for arc_tempreserve_space() throttle.
 435  */
 436 unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
 437 unsigned long zfs_arc_anon_limit_percent = 25;  /* anon block dirty limit */
 438 unsigned long zfs_arc_pool_dirty_percent = 20;  /* each pool's anon allowance */
 439
 440 /*
 441  * Enable or disable compressed arc buffers (enabled by default).
 442  */
 443 int zfs_compressed_arc_enabled = B_TRUE;
 444
 445 /*
 446  * ARC will evict meta buffers that exceed arc_meta_limit. This
 447  * tunable makes arc_meta_limit adjustable for different workloads.
 448  */
 449 unsigned long zfs_arc_meta_limit_percent = 75;
 450
 451 /*
 452  * Percentage of ARC meta buffers that can be consumed by dnodes.
 453  */
 454 unsigned long zfs_arc_dnode_limit_percent = 10;
 455
 456 /*
 457  * These tunables are Linux-specific.
 458  */
 459 unsigned long zfs_arc_sys_free = 0;
 460 int zfs_arc_min_prefetch_ms = 0;
 461 int zfs_arc_min_prescient_prefetch_ms = 0;
 462 int zfs_arc_p_dampener_disable = 1;
 463 int zfs_arc_meta_prune = 10000;
 464 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 465 int zfs_arc_meta_adjust_restarts = 4096;
 466 int zfs_arc_lotsfree_percent = 10;
 467
 468 /* The 6 ARC states, one arc_state_t object each: */
 469 arc_state_t ARC_anon;
 470 arc_state_t ARC_mru;
 471 arc_state_t ARC_mru_ghost;
 472 arc_state_t ARC_mfu;
 473 arc_state_t ARC_mfu_ghost;
 474 arc_state_t ARC_l2c_only;
 475
 476 arc_stats_t arc_stats = { /* positional: entries follow arc_stats_t order */
 477         { "hits",                       KSTAT_DATA_UINT64 },
 478         { "misses",                     KSTAT_DATA_UINT64 },
 479         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 480         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 481         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 482         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 483         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 484         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 485         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 486         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 487         { "mru_hits",                   KSTAT_DATA_UINT64 },
 488         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 489         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 490         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 491         { "deleted",                    KSTAT_DATA_UINT64 },
 492         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 493         { "access_skip",                KSTAT_DATA_UINT64 },
 494         { "evict_skip",                 KSTAT_DATA_UINT64 },
 495         { "evict_not_enough",           KSTAT_DATA_UINT64 },
 496         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 497         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 498         { "evict_l2_eligible_mfu",      KSTAT_DATA_UINT64 },
 499         { "evict_l2_eligible_mru",      KSTAT_DATA_UINT64 },
 500         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 501         { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 502         { "hash_elements",              KSTAT_DATA_UINT64 },
 503         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 504         { "hash_collisions",            KSTAT_DATA_UINT64 },
 505         { "hash_chains",                KSTAT_DATA_UINT64 },
 506         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 507         { "p",                          KSTAT_DATA_UINT64 },
 508         { "c",                          KSTAT_DATA_UINT64 },
 509         { "c_min",                      KSTAT_DATA_UINT64 },
 510         { "c_max",                      KSTAT_DATA_UINT64 },
 511         { "size",                       KSTAT_DATA_UINT64 },
 512         { "compressed_size",            KSTAT_DATA_UINT64 },
 513         { "uncompressed_size",          KSTAT_DATA_UINT64 },
 514         { "overhead_size",              KSTAT_DATA_UINT64 },
 515         { "hdr_size",                   KSTAT_DATA_UINT64 },
 516         { "data_size",                  KSTAT_DATA_UINT64 },
 517         { "metadata_size",              KSTAT_DATA_UINT64 },
 518         { "dbuf_size",                  KSTAT_DATA_UINT64 },
 519         { "dnode_size",                 KSTAT_DATA_UINT64 },
 520         { "bonus_size",                 KSTAT_DATA_UINT64 },
 521 #if defined(COMPAT_FREEBSD11)
 522         { "other_size",                 KSTAT_DATA_UINT64 },
 523 #endif /* COMPAT_FREEBSD11 */
 524         { "anon_size",                  KSTAT_DATA_UINT64 },
 525         { "anon_evictable_data",        KSTAT_DATA_UINT64 },
 526         { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
 527         { "mru_size",                   KSTAT_DATA_UINT64 },
 528         { "mru_evictable_data",         KSTAT_DATA_UINT64 },
 529         { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
 530         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 531         { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 532         { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 533         { "mfu_size",                   KSTAT_DATA_UINT64 },
 534         { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
 535         { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
 536         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 537         { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 538         { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 539         { "l2_hits",                    KSTAT_DATA_UINT64 },
 540         { "l2_misses",                  KSTAT_DATA_UINT64 },
 541         { "l2_prefetch_asize",          KSTAT_DATA_UINT64 },
 542         { "l2_mru_asize",               KSTAT_DATA_UINT64 },
 543         { "l2_mfu_asize",               KSTAT_DATA_UINT64 },
 544         { "l2_bufc_data_asize",         KSTAT_DATA_UINT64 },
 545         { "l2_bufc_metadata_asize",     KSTAT_DATA_UINT64 },
 546         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 547         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 548         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 549         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 550         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 551         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 552         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 553         { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 554         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 555         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 556         { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 557         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 558         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 559         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 560         { "l2_io_error",                KSTAT_DATA_UINT64 },
 561         { "l2_size",                    KSTAT_DATA_UINT64 },
 562         { "l2_asize",                   KSTAT_DATA_UINT64 },
 563         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 564         { "l2_log_blk_writes",          KSTAT_DATA_UINT64 },
 565         { "l2_log_blk_avg_asize",       KSTAT_DATA_UINT64 },
 566         { "l2_log_blk_asize",           KSTAT_DATA_UINT64 },
 567         { "l2_log_blk_count",           KSTAT_DATA_UINT64 },
 568         { "l2_data_to_meta_ratio",      KSTAT_DATA_UINT64 },
 569         { "l2_rebuild_success",         KSTAT_DATA_UINT64 },
 570         { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
 571         { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
 572         { "l2_rebuild_dh_errors",       KSTAT_DATA_UINT64 },
 573         { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
 574         { "l2_rebuild_lowmem",          KSTAT_DATA_UINT64 },
 575         { "l2_rebuild_size",            KSTAT_DATA_UINT64 },
 576         { "l2_rebuild_asize",           KSTAT_DATA_UINT64 },
 577         { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
 578         { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
 579         { "l2_rebuild_log_blks",        KSTAT_DATA_UINT64 },
 580         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 581         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 582         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 583         { "memory_all_bytes",           KSTAT_DATA_UINT64 },
 584         { "memory_free_bytes",          KSTAT_DATA_UINT64 },
 585         { "memory_available_bytes",     KSTAT_DATA_INT64 }, /* only signed one */
 586         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 587         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 588         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 589         { "arc_prune",                  KSTAT_DATA_UINT64 },
 590         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 591         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 592         { "arc_dnode_limit",            KSTAT_DATA_UINT64 },
 593         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 594         { "arc_meta_min",               KSTAT_DATA_UINT64 },
 595         { "async_upgrade_sync",         KSTAT_DATA_UINT64 },
 596         { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 597         { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 598         { "arc_need_free",              KSTAT_DATA_UINT64 },
 599         { "arc_sys_free",               KSTAT_DATA_UINT64 },
 600         { "arc_raw_size",               KSTAT_DATA_UINT64 },
 601         { "cached_only_in_progress",    KSTAT_DATA_UINT64 },
 602         { "abd_chunk_waste_size",       KSTAT_DATA_UINT64 },
 603 };
 604
 605 arc_sums_t arc_sums; /* NOTE(review): type defined elsewhere; role not shown */
 606
 607 #define ARCSTAT_MAX(stat, val) { /* raise arc_stats.stat to val if larger */ \
 608         uint64_t m; /* last value read; NB: val is evaluated repeatedly */ \
 609         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 610             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 611                 continue; /* CAS result differed from m: re-read, retry */ \
 612 }
 613
 614 /*
 615  * Break ARC hits/misses down by two conditions: bump the counter named
 616  * arcstat_{stat1|notstat1}_{stat2|notstat2}_{stat}, picked by cond1 and cond2.
 617  * That is four subtypes for each of hits and misses (eight statistics total).
 618  */
 619 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 620         if (cond1) { /* NB: expands to a bare if/else statement */      \
 621                 if (cond2) {                                            \
 622                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 623                 } else {                                                \
 624                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 625                 }                                                       \
 626         } else { /* cond1 false: use the notstat1 name component */     \
 627                 if (cond2) {                                            \
 628                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 629                 } else {                                                \
 630                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 631                 }                                                       \
 632         }
 633
/*
 * This macro allows us to use kstats as floating averages. Each time we
 * update this kstat, we first factor it and the update value by
 * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
 * average. This macro assumes that integer loads and stores are atomic, but
 * is not safe for multiple writers updating the kstat in parallel (only the
 * last writer's update will remain).
 */
 642 #define ARCSTAT_F_AVG_FACTOR    3
 643 #define ARCSTAT_F_AVG(stat, value) \
 644         do { \
 645                 uint64_t x = ARCSTAT(stat); \
 646                 x = x - x / ARCSTAT_F_AVG_FACTOR + \
 647                     (value) / ARCSTAT_F_AVG_FACTOR; \
 648                 ARCSTAT(stat) = x; \
 649                 _NOTE(CONSTCOND) \
 650         } while (0)
 651
 652 kstat_t                 *arc_ksp;
 653
 654 /*
 655  * There are several ARC variables that are critical to export as kstats --
 656  * but we don't want to have to grovel around in the kstat whenever we wish to
 657  * manipulate them.  For these variables, we therefore define them to be in
 658  * terms of the statistic variable.  This assures that we are not introducing
 659  * the possibility of inconsistency by having shadow copies of the variables,
 660  * while still allowing the code to be readable.
 661  */
 662 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
 663 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)
 664 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 665 /* max size for dnodes */
 666 #define arc_dnode_size_limit    ARCSTAT(arcstat_dnode_limit)
 667 #define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
 668 #define arc_need_free   ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 669
 670 hrtime_t arc_growtime;
 671 list_t arc_prune_list;
 672 kmutex_t arc_prune_mtx;
 673 taskq_t *arc_prune_taskq;
 674
 675 #define GHOST_STATE(state)      \
 676         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 677         (state) == arc_l2c_only)
 678
 679 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 680 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 681 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 682 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 683 #define HDR_PRESCIENT_PREFETCH(hdr)     \
 684         ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 685 #define HDR_COMPRESSION_ENABLED(hdr)    \
 686         ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 687
 688 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 689 #define HDR_L2_READING(hdr)     \
 690         (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&  \
 691         ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 692 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 693 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 694 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 695 #define HDR_PROTECTED(hdr)      ((hdr)->b_flags & ARC_FLAG_PROTECTED)
 696 #define HDR_NOAUTH(hdr)         ((hdr)->b_flags & ARC_FLAG_NOAUTH)
 697 #define HDR_SHARED_DATA(hdr)    ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 698
 699 #define HDR_ISTYPE_METADATA(hdr)        \
 700         ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 701 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 702
 703 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 704 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 705 #define HDR_HAS_RABD(hdr)       \
 706         (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&    \
 707         (hdr)->b_crypt_hdr.b_rabd != NULL)
 708 #define HDR_ENCRYPTED(hdr)      \
 709         (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 710 #define HDR_AUTHENTICATED(hdr)  \
 711         (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 712
 713 /* For storing compression mode in b_flags */
 714 #define HDR_COMPRESS_OFFSET     (highbit64(ARC_FLAG_COMPRESS_0) - 1)
 715
 716 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET((hdr)->b_flags, \
 717         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 718 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 719         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 720
 721 #define ARC_BUF_LAST(buf)       ((buf)->b_next == NULL)
 722 #define ARC_BUF_SHARED(buf)     ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 723 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 724 #define ARC_BUF_ENCRYPTED(buf)  ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 725
 726 /*
 727  * Other sizes
 728  */
 729
 730 #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 731 #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
 732 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 733
 734 /*
 735  * Hash table routines
 736  */
 737
/*
 * Number of mutexes protecting the hash chains.  BUF_HASH_LOCK() selects a
 * lock by masking the chain index with (BUF_LOCKS-1), so this value must
 * remain a power of two.
 */
#define BUF_LOCKS 2048
/*
 * The ARC buffer header hash table.
 */
typedef struct buf_hash_table {
	/* table size minus one (set from hsize - 1 in buf_init()) */
	uint64_t ht_mask;
	/* array of chain heads; headers are chained through b_hash_next */
	arc_buf_hdr_t **ht_table;
	/* chain locks, shared between chains via BUF_HASH_LOCK() */
	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
} buf_hash_table_t;
 744
 745 static buf_hash_table_t buf_hash_table;
 746
 747 #define BUF_HASH_INDEX(spa, dva, birth) \
 748         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 749 #define BUF_HASH_LOCK(idx)      (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 750 #define HDR_LOCK(hdr) \
 751         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 752
 753 uint64_t zfs_crc64_table[256];
 754
 755 /*
 756  * Level 2 ARC
 757  */
 758
 759 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 760 #define L2ARC_HEADROOM          2                       /* num of writes */
 761
 762 /*
 763  * If we discover during ARC scan any buffers to be compressed, we boost
 764  * our headroom for the next scanning cycle by this percentage multiple.
 765  */
 766 #define L2ARC_HEADROOM_BOOST    200
 767 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 768 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 769
 770 /*
 771  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
 772  * and each of the state has two types: data and metadata.
 773  */
 774 #define L2ARC_FEED_TYPES        4
 775
 776 /* L2ARC Performance Tunables */
 777 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
 778 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
 779 unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
 780 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 781 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
 782 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
 783 int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
 784 int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
 785 int l2arc_norw = B_FALSE;                       /* no reads during writes */
 786 int l2arc_meta_percent = 33;                    /* limit on headers size */
 787
 788 /*
 789  * L2ARC Internals
 790  */
 791 static list_t L2ARC_dev_list;                   /* device list */
 792 static list_t *l2arc_dev_list;                  /* device list pointer */
 793 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 794 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 795 static list_t L2ARC_free_on_write;              /* free after write buf list */
 796 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 797 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 798 static uint64_t l2arc_ndev;                     /* number of devices */
 799
 800 typedef struct l2arc_read_callback {
 801         arc_buf_hdr_t           *l2rcb_hdr;             /* read header */
 802         blkptr_t                l2rcb_bp;               /* original blkptr */
 803         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 804         int                     l2rcb_flags;            /* original flags */
 805         abd_t                   *l2rcb_abd;             /* temporary buffer */
 806 } l2arc_read_callback_t;
 807
 808 typedef struct l2arc_data_free {
 809         /* protected by l2arc_free_on_write_mtx */
 810         abd_t           *l2df_abd;
 811         size_t          l2df_size;
 812         arc_buf_contents_t l2df_type;
 813         list_node_t     l2df_list_node;
 814 } l2arc_data_free_t;
 815
 816 typedef enum arc_fill_flags {
 817         ARC_FILL_LOCKED         = 1 << 0, /* hdr lock is held */
 818         ARC_FILL_COMPRESSED     = 1 << 1, /* fill with compressed data */
 819         ARC_FILL_ENCRYPTED      = 1 << 2, /* fill with encrypted data */
 820         ARC_FILL_NOAUTH         = 1 << 3, /* don't attempt to authenticate */
 821         ARC_FILL_IN_PLACE       = 1 << 4  /* fill in place (special case) */
 822 } arc_fill_flags_t;
 823
 824 typedef enum arc_ovf_level {
 825         ARC_OVF_NONE,                   /* ARC within target size. */
 826         ARC_OVF_SOME,                   /* ARC is slightly overflowed. */
 827         ARC_OVF_SEVERE                  /* ARC is severely overflowed. */
 828 } arc_ovf_level_t;
 829
 830 static kmutex_t l2arc_feed_thr_lock;
 831 static kcondvar_t l2arc_feed_thr_cv;
 832 static uint8_t l2arc_thread_exit;
 833
 834 static kmutex_t l2arc_rebuild_thr_lock;
 835 static kcondvar_t l2arc_rebuild_thr_cv;
 836
 837 enum arc_hdr_alloc_flags {
 838         ARC_HDR_ALLOC_RDATA = 0x1,
 839         ARC_HDR_DO_ADAPT = 0x2,
 840         ARC_HDR_USE_RESERVE = 0x4,
 841 };
 842
 843
 844 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int);
 845 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
 846 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int);
 847 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
 848 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
 849 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
 850 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 851 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
 852 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 853 static void arc_buf_watch(arc_buf_t *);
 854
 855 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 856 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 857 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 858 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 859
 860 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 861 static void l2arc_read_done(zio_t *);
 862 static void l2arc_do_free_on_write(void);
 863 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
 864     boolean_t state_only);
 865
 866 #define l2arc_hdr_arcstats_increment(hdr) \
 867         l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 868 #define l2arc_hdr_arcstats_decrement(hdr) \
 869         l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 870 #define l2arc_hdr_arcstats_increment_state(hdr) \
 871         l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 872 #define l2arc_hdr_arcstats_decrement_state(hdr) \
 873         l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 874
 875 /*
 876  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
 877  *              metadata and data are cached from ARC into L2ARC.
 878  */
 879 int l2arc_mfuonly = 0;
 880
 881 /*
 882  * L2ARC TRIM
 883  * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
 884  *              the current write size (l2arc_write_max) we should TRIM if we
 885  *              have filled the device. It is defined as a percentage of the
 886  *              write size. If set to 100 we trim twice the space required to
 887  *              accommodate upcoming writes. A minimum of 64MB will be trimmed.
 888  *              It also enables TRIM of the whole L2ARC device upon creation or
 889  *              addition to an existing pool or if the header of the device is
 890  *              invalid upon importing a pool or onlining a cache device. The
 891  *              default is 0, which disables TRIM on L2ARC altogether as it can
 892  *              put significant stress on the underlying storage devices. This
 893  *              will vary depending of how well the specific device handles
 894  *              these commands.
 895  */
 896 unsigned long l2arc_trim_ahead = 0;
 897
 898 /*
 899  * Performance tuning of L2ARC persistence:
 900  *
 901  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
 902  *              an L2ARC device (either at pool import or later) will attempt
 903  *              to rebuild L2ARC buffer contents.
 904  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
 905  *              whether log blocks are written to the L2ARC device. If the L2ARC
 906  *              device is less than 1GB, the amount of data l2arc_evict()
 907  *              evicts is significant compared to the amount of restored L2ARC
 908  *              data. In this case do not write log blocks in L2ARC in order
 909  *              not to waste space.
 910  */
 911 int l2arc_rebuild_enabled = B_TRUE;
 912 unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
 913
 914 /* L2ARC persistence rebuild control routines. */
 915 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
 916 static void l2arc_dev_rebuild_thread(void *arg);
 917 static int l2arc_rebuild(l2arc_dev_t *dev);
 918
 919 /* L2ARC persistence read I/O routines. */
 920 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
 921 static int l2arc_log_blk_read(l2arc_dev_t *dev,
 922     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
 923     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
 924     zio_t *this_io, zio_t **next_io);
 925 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
 926     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
 927 static void l2arc_log_blk_fetch_abort(zio_t *zio);
 928
 929 /* L2ARC persistence block restoration routines. */
 930 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
 931     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
 932 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
 933     l2arc_dev_t *dev);
 934
 935 /* L2ARC persistence write I/O routines. */
 936 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
 937     l2arc_write_callback_t *cb);
 938
 939 /* L2ARC persistence auxiliary routines. */
 940 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
 941     const l2arc_log_blkptr_t *lbp);
 942 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
 943     const arc_buf_hdr_t *ab);
 944 boolean_t l2arc_range_check_overlap(uint64_t bottom,
 945     uint64_t top, uint64_t check);
 946 static void l2arc_blk_fetch_done(zio_t *zio);
 947 static inline uint64_t
 948     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
 949
 950 /*
 951  * We use Cityhash for this. It's fast, and has good hash properties without
 952  * requiring any large static buffers.
 953  */
 954 static uint64_t
 955 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 956 {
 957         return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
 958 }
 959
/*
 * True when both DVA words of the header are zero, i.e. the header carries
 * no block identity (this is the state buf_discard_identity() leaves).
 */
#define HDR_EMPTY(hdr)                                          \
	((hdr)->b_dva.dva_word[0] == 0 &&                       \
	(hdr)->b_dva.dva_word[1] == 0)

/*
 * True when the header has no identity or the current thread holds the
 * hash lock covering it.  Intended for use in assertions.
 */
#define HDR_EMPTY_OR_LOCKED(hdr)                                \
	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 966
 967 #define HDR_EQUAL(spa, dva, birth, hdr)                         \
 968         ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 969         ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 970         ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 971
 972 static void
 973 buf_discard_identity(arc_buf_hdr_t *hdr)
 974 {
 975         hdr->b_dva.dva_word[0] = 0;
 976         hdr->b_dva.dva_word[1] = 0;
 977         hdr->b_birth = 0;
 978 }
 979
 980 static arc_buf_hdr_t *
 981 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 982 {
 983         const dva_t *dva = BP_IDENTITY(bp);
 984         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 985         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 986         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 987         arc_buf_hdr_t *hdr;
 988
 989         mutex_enter(hash_lock);
 990         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 991             hdr = hdr->b_hash_next) {
 992                 if (HDR_EQUAL(spa, dva, birth, hdr)) {
 993                         *lockp = hash_lock;
 994                         return (hdr);
 995                 }
 996         }
 997         mutex_exit(hash_lock);
 998         *lockp = NULL;
 999         return (NULL);
1000 }
1001
1002 /*
1003  * Insert an entry into the hash table.  If there is already an element
1004  * equal to elem in the hash table, then the already existing element
1005  * will be returned and the new element will not be inserted.
1006  * Otherwise returns NULL.
1007  * If lockp == NULL, the caller is assumed to already hold the hash lock.
1008  */
1009 static arc_buf_hdr_t *
1010 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1011 {
1012         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1013         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1014         arc_buf_hdr_t *fhdr;
1015         uint32_t i;
1016
1017         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1018         ASSERT(hdr->b_birth != 0);
1019         ASSERT(!HDR_IN_HASH_TABLE(hdr));
1020
1021         if (lockp != NULL) {
1022                 *lockp = hash_lock;
1023                 mutex_enter(hash_lock);
1024         } else {
1025                 ASSERT(MUTEX_HELD(hash_lock));
1026         }
1027
1028         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1029             fhdr = fhdr->b_hash_next, i++) {
1030                 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1031                         return (fhdr);
1032         }
1033
1034         hdr->b_hash_next = buf_hash_table.ht_table[idx];
1035         buf_hash_table.ht_table[idx] = hdr;
1036         arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1037
1038         /* collect some hash table performance data */
1039         if (i > 0) {
1040                 ARCSTAT_BUMP(arcstat_hash_collisions);
1041                 if (i == 1)
1042                         ARCSTAT_BUMP(arcstat_hash_chains);
1043
1044                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1045         }
1046         uint64_t he = atomic_inc_64_nv(
1047             &arc_stats.arcstat_hash_elements.value.ui64);
1048         ARCSTAT_MAX(arcstat_hash_elements_max, he);
1049
1050         return (NULL);
1051 }
1052
1053 static void
1054 buf_hash_remove(arc_buf_hdr_t *hdr)
1055 {
1056         arc_buf_hdr_t *fhdr, **hdrp;
1057         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1058
1059         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1060         ASSERT(HDR_IN_HASH_TABLE(hdr));
1061
1062         hdrp = &buf_hash_table.ht_table[idx];
1063         while ((fhdr = *hdrp) != hdr) {
1064                 ASSERT3P(fhdr, !=, NULL);
1065                 hdrp = &fhdr->b_hash_next;
1066         }
1067         *hdrp = hdr->b_hash_next;
1068         hdr->b_hash_next = NULL;
1069         arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1070
1071         /* collect some hash table performance data */
1072         atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64);
1073
1074         if (buf_hash_table.ht_table[idx] &&
1075             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1076                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1077 }
1078
1079 /*
1080  * Global data structures and functions for the buf kmem cache.
1081  */
1082
1083 static kmem_cache_t *hdr_full_cache;
1084 static kmem_cache_t *hdr_full_crypt_cache;
1085 static kmem_cache_t *hdr_l2only_cache;
1086 static kmem_cache_t *buf_cache;
1087
1088 static void
1089 buf_fini(void)
1090 {
1091         int i;
1092
1093 #if defined(_KERNEL)
1094         /*
1095          * Large allocations which do not require contiguous pages
1096          * should be using vmem_free() in the linux kernel\
1097          */
1098         vmem_free(buf_hash_table.ht_table,
1099             (buf_hash_table.ht_mask + 1) * sizeof (void *));
1100 #else
1101         kmem_free(buf_hash_table.ht_table,
1102             (buf_hash_table.ht_mask + 1) * sizeof (void *));
1103 #endif
1104         for (i = 0; i < BUF_LOCKS; i++)
1105                 mutex_destroy(BUF_HASH_LOCK(i));
1106         kmem_cache_destroy(hdr_full_cache);
1107         kmem_cache_destroy(hdr_full_crypt_cache);
1108         kmem_cache_destroy(hdr_l2only_cache);
1109         kmem_cache_destroy(buf_cache);
1110 }
1111
1112 /*
1113  * Constructor callback - called when the cache is empty
1114  * and a new buf is requested.
1115  */
1116 /* ARGSUSED */
1117 static int
1118 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1119 {
1120         arc_buf_hdr_t *hdr = vbuf;
1121
1122         bzero(hdr, HDR_FULL_SIZE);
1123         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1124         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1125         zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1126         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1127         list_link_init(&hdr->b_l1hdr.b_arc_node);
1128         list_link_init(&hdr->b_l2hdr.b_l2node);
1129         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1130         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1131
1132         return (0);
1133 }
1134
1135 /* ARGSUSED */
1136 static int
1137 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
1138 {
1139         arc_buf_hdr_t *hdr = vbuf;
1140
1141         hdr_full_cons(vbuf, unused, kmflag);
1142         bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
1143         arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1144
1145         return (0);
1146 }
1147
1148 /* ARGSUSED */
1149 static int
1150 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1151 {
1152         arc_buf_hdr_t *hdr = vbuf;
1153
1154         bzero(hdr, HDR_L2ONLY_SIZE);
1155         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1156
1157         return (0);
1158 }
1159
1160 /* ARGSUSED */
1161 static int
1162 buf_cons(void *vbuf, void *unused, int kmflag)
1163 {
1164         arc_buf_t *buf = vbuf;
1165
1166         bzero(buf, sizeof (arc_buf_t));
1167         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1168         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1169
1170         return (0);
1171 }
1172
1173 /*
1174  * Destructor callback - called when a cached buf is
1175  * no longer required.
1176  */
1177 /* ARGSUSED */
1178 static void
1179 hdr_full_dest(void *vbuf, void *unused)
1180 {
1181         arc_buf_hdr_t *hdr = vbuf;
1182
1183         ASSERT(HDR_EMPTY(hdr));
1184         cv_destroy(&hdr->b_l1hdr.b_cv);
1185         zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1186         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1187         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1188         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1189 }
1190
1191 /* ARGSUSED */
1192 static void
1193 hdr_full_crypt_dest(void *vbuf, void *unused)
1194 {
1195         arc_buf_hdr_t *hdr = vbuf;
1196
1197         hdr_full_dest(vbuf, unused);
1198         arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1199 }
1200
1201 /* ARGSUSED */
1202 static void
1203 hdr_l2only_dest(void *vbuf, void *unused)
1204 {
1205         arc_buf_hdr_t *hdr __maybe_unused = vbuf;
1206
1207         ASSERT(HDR_EMPTY(hdr));
1208         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1209 }
1210
1211 /* ARGSUSED */
1212 static void
1213 buf_dest(void *vbuf, void *unused)
1214 {
1215         arc_buf_t *buf = vbuf;
1216
1217         mutex_destroy(&buf->b_evict_lock);
1218         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1219 }
1220
1221 static void
1222 buf_init(void)
1223 {
1224         uint64_t *ct = NULL;
1225         uint64_t hsize = 1ULL << 12;
1226         int i, j;
1227
1228         /*
1229          * The hash table is big enough to fill all of physical memory
1230          * with an average block size of zfs_arc_average_blocksize (default 8K).
1231          * By default, the table will take up
1232          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1233          */
1234         while (hsize * zfs_arc_average_blocksize < arc_all_memory())
1235                 hsize <<= 1;
1236 retry:
1237         buf_hash_table.ht_mask = hsize - 1;
1238 #if defined(_KERNEL)
1239         /*
1240          * Large allocations which do not require contiguous pages
1241          * should be using vmem_alloc() in the linux kernel
1242          */
1243         buf_hash_table.ht_table =
1244             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1245 #else
1246         buf_hash_table.ht_table =
1247             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1248 #endif
1249         if (buf_hash_table.ht_table == NULL) {
1250                 ASSERT(hsize > (1ULL << 8));
1251                 hsize >>= 1;
1252                 goto retry;
1253         }
1254
1255         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1256             0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
1257         hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1258             HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1259             NULL, NULL, NULL, 0);
1260         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1261             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
1262             NULL, NULL, 0);
1263         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1264             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1265
1266         for (i = 0; i < 256; i++)
1267                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1268                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1269
1270         for (i = 0; i < BUF_LOCKS; i++)
1271                 mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
1272 }
1273
1274 #define ARC_MINTIME     (hz>>4) /* 62 ms */
1275
1276 /*
1277  * This is the size that the buf occupies in memory. If the buf is compressed,
1278  * it will correspond to the compressed size. You should use this method of
1279  * getting the buf size unless you explicitly need the logical size.
1280  */
1281 uint64_t
1282 arc_buf_size(arc_buf_t *buf)
1283 {
1284         return (ARC_BUF_COMPRESSED(buf) ?
1285             HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1286 }
1287
1288 uint64_t
1289 arc_buf_lsize(arc_buf_t *buf)
1290 {
1291         return (HDR_GET_LSIZE(buf->b_hdr));
1292 }
1293
1294 /*
1295  * This function will return B_TRUE if the buffer is encrypted in memory.
1296  * This buffer can be decrypted by calling arc_untransform().
1297  */
1298 boolean_t
1299 arc_is_encrypted(arc_buf_t *buf)
1300 {
1301         return (ARC_BUF_ENCRYPTED(buf) != 0);
1302 }
1303
1304 /*
1305  * Returns B_TRUE if the buffer represents data that has not had its MAC
1306  * verified yet.
1307  */
1308 boolean_t
1309 arc_is_unauthenticated(arc_buf_t *buf)
1310 {
1311         return (HDR_NOAUTH(buf->b_hdr) != 0);
1312 }
1313
1314 void
1315 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1316     uint8_t *iv, uint8_t *mac)
1317 {
1318         arc_buf_hdr_t *hdr = buf->b_hdr;
1319
1320         ASSERT(HDR_PROTECTED(hdr));
1321
1322         bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1323         bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1324         bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1325         *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1326             ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1327 }
1328
1329 /*
1330  * Indicates how this buffer is compressed in memory. If it is not compressed
1331  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1332  * arc_untransform() as long as it is also unencrypted.
1333  */
1334 enum zio_compress
1335 arc_get_compression(arc_buf_t *buf)
1336 {
1337         return (ARC_BUF_COMPRESSED(buf) ?
1338             HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1339 }
1340
1341 /*
1342  * Return the compression algorithm used to store this data in the ARC. If ARC
1343  * compression is enabled or this is an encrypted block, this will be the same
1344  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1345  */
1346 static inline enum zio_compress
1347 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1348 {
1349         return (HDR_COMPRESSION_ENABLED(hdr) ?
1350             HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1351 }
1352
1353 uint8_t
1354 arc_get_complevel(arc_buf_t *buf)
1355 {
1356         return (buf->b_hdr->b_complevel);
1357 }
1358
1359 static inline boolean_t
1360 arc_buf_is_shared(arc_buf_t *buf)
1361 {
1362         boolean_t shared = (buf->b_data != NULL &&
1363             buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1364             abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1365             buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1366         IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1367         IMPLY(shared, ARC_BUF_SHARED(buf));
1368         IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1369
1370         /*
1371          * It would be nice to assert arc_can_share() too, but the "hdr isn't
1372          * already being shared" requirement prevents us from doing that.
1373          */
1374
1375         return (shared);
1376 }
1377
1378 /*
1379  * Free the checksum associated with this header. If there is no checksum, this
1380  * is a no-op.
1381  */
1382 static inline void
1383 arc_cksum_free(arc_buf_hdr_t *hdr)
1384 {
1385         ASSERT(HDR_HAS_L1HDR(hdr));
1386
1387         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1388         if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1389                 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1390                 hdr->b_l1hdr.b_freeze_cksum = NULL;
1391         }
1392         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1393 }
1394
1395 /*
1396  * Return true iff at least one of the bufs on hdr is not compressed.
1397  * Encrypted buffers count as compressed.
1398  */
1399 static boolean_t
1400 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1401 {
1402         ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1403
1404         for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1405                 if (!ARC_BUF_COMPRESSED(b)) {
1406                         return (B_TRUE);
1407                 }
1408         }
1409         return (B_FALSE);
1410 }
1411
1412
1413 /*
1414  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1415  * matches the checksum that is stored in the hdr. If there is no checksum,
1416  * or if the buf is compressed, this is a no-op.
1417  */
1418 static void
1419 arc_cksum_verify(arc_buf_t *buf)
1420 {
1421         arc_buf_hdr_t *hdr = buf->b_hdr;
1422         zio_cksum_t zc;
1423
1424         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1425                 return;
1426
1427         if (ARC_BUF_COMPRESSED(buf))
1428                 return;
1429
1430         ASSERT(HDR_HAS_L1HDR(hdr));
1431
1432         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1433
1434         if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1435                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1436                 return;
1437         }
1438
1439         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1440         if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1441                 panic("buffer modified while frozen!");
1442         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1443 }
1444
1445 /*
1446  * This function makes the assumption that data stored in the L2ARC
1447  * will be transformed exactly as it is in the main pool. Because of
1448  * this we can verify the checksum against the reading process's bp.
1449  */
1450 static boolean_t
1451 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1452 {
1453         ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1454         VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1455
1456         /*
1457          * Block pointers always store the checksum for the logical data.
1458          * If the block pointer has the gang bit set, then the checksum
1459          * it represents is for the reconstituted data and not for an
1460          * individual gang member. The zio pipeline, however, must be able to
1461          * determine the checksum of each of the gang constituents so it
1462          * treats the checksum comparison differently than what we need
1463          * for l2arc blocks. This prevents us from using the
1464          * zio_checksum_error() interface directly. Instead we must call the
1465          * zio_checksum_error_impl() so that we can ensure the checksum is
1466          * generated using the correct checksum algorithm and accounts for the
1467          * logical I/O size and not just a gang fragment.
1468          */
1469         return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1470             BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1471             zio->io_offset, NULL) == 0);
1472 }
1473
1474 /*
1475  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1476  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1477  * isn't modified later on. If buf is compressed or there is already a checksum
1478  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1479  */
1480 static void
1481 arc_cksum_compute(arc_buf_t *buf)
1482 {
1483         arc_buf_hdr_t *hdr = buf->b_hdr;
1484
1485         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1486                 return;
1487
1488         ASSERT(HDR_HAS_L1HDR(hdr));
1489
1490         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1491         if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1492                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1493                 return;
1494         }
1495
1496         ASSERT(!ARC_BUF_ENCRYPTED(buf));
1497         ASSERT(!ARC_BUF_COMPRESSED(buf));
1498         hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1499             KM_SLEEP);
1500         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1501             hdr->b_l1hdr.b_freeze_cksum);
1502         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1503         arc_buf_watch(buf);
1504 }
1505
1506 #ifndef _KERNEL
1507 void
1508 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1509 {
1510         panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
1511 }
1512 #endif
1513
1514 /* ARGSUSED */
1515 static void
1516 arc_buf_unwatch(arc_buf_t *buf)
1517 {
1518 #ifndef _KERNEL
1519         if (arc_watch) {
1520                 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
1521                     PROT_READ | PROT_WRITE));
1522         }
1523 #endif
1524 }
1525
1526 /* ARGSUSED */
1527 static void
1528 arc_buf_watch(arc_buf_t *buf)
1529 {
1530 #ifndef _KERNEL
1531         if (arc_watch)
1532                 ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
1533                     PROT_READ));
1534 #endif
1535 }
1536
1537 static arc_buf_contents_t
1538 arc_buf_type(arc_buf_hdr_t *hdr)
1539 {
1540         arc_buf_contents_t type;
1541         if (HDR_ISTYPE_METADATA(hdr)) {
1542                 type = ARC_BUFC_METADATA;
1543         } else {
1544                 type = ARC_BUFC_DATA;
1545         }
1546         VERIFY3U(hdr->b_type, ==, type);
1547         return (type);
1548 }
1549
1550 boolean_t
1551 arc_is_metadata(arc_buf_t *buf)
1552 {
1553         return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1554 }
1555
1556 static uint32_t
1557 arc_bufc_to_flags(arc_buf_contents_t type)
1558 {
1559         switch (type) {
1560         case ARC_BUFC_DATA:
1561                 /* metadata field is 0 if buffer contains normal data */
1562                 return (0);
1563         case ARC_BUFC_METADATA:
1564                 return (ARC_FLAG_BUFC_METADATA);
1565         default:
1566                 break;
1567         }
1568         panic("undefined ARC buffer type!");
1569         return ((uint32_t)-1);
1570 }
1571
1572 void
1573 arc_buf_thaw(arc_buf_t *buf)
1574 {
1575         arc_buf_hdr_t *hdr = buf->b_hdr;
1576
1577         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1578         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1579
1580         arc_cksum_verify(buf);
1581
1582         /*
1583          * Compressed buffers do not manipulate the b_freeze_cksum.
1584          */
1585         if (ARC_BUF_COMPRESSED(buf))
1586                 return;
1587
1588         ASSERT(HDR_HAS_L1HDR(hdr));
1589         arc_cksum_free(hdr);
1590         arc_buf_unwatch(buf);
1591 }
1592
1593 void
1594 arc_buf_freeze(arc_buf_t *buf)
1595 {
1596         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1597                 return;
1598
1599         if (ARC_BUF_COMPRESSED(buf))
1600                 return;
1601
1602         ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1603         arc_cksum_compute(buf);
1604 }
1605
1606 /*
1607  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1608  * the following functions should be used to ensure that the flags are
1609  * updated in a thread-safe way. When manipulating the flags either
1610  * the hash_lock must be held or the hdr must be undiscoverable. This
1611  * ensures that we're not racing with any other threads when updating
1612  * the flags.
1613  */
1614 static inline void
1615 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1616 {
1617         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1618         hdr->b_flags |= flags;
1619 }
1620
1621 static inline void
1622 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1623 {
1624         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1625         hdr->b_flags &= ~flags;
1626 }
1627
1628 /*
1629  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1630  * done in a special way since we have to clear and set bits
1631  * at the same time. Consumers that wish to set the compression bits
1632  * must use this function to ensure that the flags are updated in
1633  * thread-safe manner.
1634  */
1635 static void
1636 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1637 {
1638         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1639
1640         /*
1641          * Holes and embedded blocks will always have a psize = 0 so
1642          * we ignore the compression of the blkptr and set the
1643          * want to uncompress them. Mark them as uncompressed.
1644          */
1645         if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1646                 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1647                 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
1648         } else {
1649                 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1650                 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1651         }
1652
1653         HDR_SET_COMPRESS(hdr, cmp);
1654         ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
1655 }
1656
1657 /*
1658  * Looks for another buf on the same hdr which has the data decompressed, copies
1659  * from it, and returns true. If no such buf exists, returns false.
1660  */
1661 static boolean_t
1662 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1663 {
1664         arc_buf_hdr_t *hdr = buf->b_hdr;
1665         boolean_t copied = B_FALSE;
1666
1667         ASSERT(HDR_HAS_L1HDR(hdr));
1668         ASSERT3P(buf->b_data, !=, NULL);
1669         ASSERT(!ARC_BUF_COMPRESSED(buf));
1670
1671         for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1672             from = from->b_next) {
1673                 /* can't use our own data buffer */
1674                 if (from == buf) {
1675                         continue;
1676                 }
1677
1678                 if (!ARC_BUF_COMPRESSED(from)) {
1679                         bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
1680                         copied = B_TRUE;
1681                         break;
1682                 }
1683         }
1684
1685         /*
1686          * There were no decompressed bufs, so there should not be a
1687          * checksum on the hdr either.
1688          */
1689         if (zfs_flags & ZFS_DEBUG_MODIFY)
1690                 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1691
1692         return (copied);
1693 }
1694
1695 /*
1696  * Allocates an ARC buf header that's in an evicted & L2-cached state.
1697  * This is used during l2arc reconstruction to make empty ARC buffers
1698  * which circumvent the regular disk->arc->l2arc path and instead come
1699  * into being in the reverse order, i.e. l2arc->arc.
1700  */
1701 static arc_buf_hdr_t *
1702 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
1703     dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
1704     enum zio_compress compress, uint8_t complevel, boolean_t protected,
1705     boolean_t prefetch, arc_state_type_t arcs_state)
1706 {
1707         arc_buf_hdr_t   *hdr;
1708
1709         ASSERT(size != 0);
1710         hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
1711         hdr->b_birth = birth;
1712         hdr->b_type = type;
1713         hdr->b_flags = 0;
1714         arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
1715         HDR_SET_LSIZE(hdr, size);
1716         HDR_SET_PSIZE(hdr, psize);
1717         arc_hdr_set_compress(hdr, compress);
1718         hdr->b_complevel = complevel;
1719         if (protected)
1720                 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
1721         if (prefetch)
1722                 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
1723         hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
1724
1725         hdr->b_dva = dva;
1726
1727         hdr->b_l2hdr.b_dev = dev;
1728         hdr->b_l2hdr.b_daddr = daddr;
1729         hdr->b_l2hdr.b_arcs_state = arcs_state;
1730
1731         return (hdr);
1732 }
1733
1734 /*
1735  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1736  */
1737 static uint64_t
1738 arc_hdr_size(arc_buf_hdr_t *hdr)
1739 {
1740         uint64_t size;
1741
1742         if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1743             HDR_GET_PSIZE(hdr) > 0) {
1744                 size = HDR_GET_PSIZE(hdr);
1745         } else {
1746                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1747                 size = HDR_GET_LSIZE(hdr);
1748         }
1749         return (size);
1750 }
1751
1752 static int
1753 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1754 {
1755         int ret;
1756         uint64_t csize;
1757         uint64_t lsize = HDR_GET_LSIZE(hdr);
1758         uint64_t psize = HDR_GET_PSIZE(hdr);
1759         void *tmpbuf = NULL;
1760         abd_t *abd = hdr->b_l1hdr.b_pabd;
1761
1762         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1763         ASSERT(HDR_AUTHENTICATED(hdr));
1764         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1765
1766         /*
1767          * The MAC is calculated on the compressed data that is stored on disk.
1768          * However, if compressed arc is disabled we will only have the
1769          * decompressed data available to us now. Compress it into a temporary
1770          * abd so we can verify the MAC. The performance overhead of this will
1771          * be relatively low, since most objects in an encrypted objset will
1772          * be encrypted (instead of authenticated) anyway.
1773          */
1774         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1775             !HDR_COMPRESSION_ENABLED(hdr)) {
1776                 tmpbuf = zio_buf_alloc(lsize);
1777                 abd = abd_get_from_buf(tmpbuf, lsize);
1778                 abd_take_ownership_of_buf(abd, B_TRUE);
1779                 csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1780                     hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel);
1781                 ASSERT3U(csize, <=, psize);
1782                 abd_zero_off(abd, csize, psize - csize);
1783         }
1784
1785         /*
1786          * Authentication is best effort. We authenticate whenever the key is
1787          * available. If we succeed we clear ARC_FLAG_NOAUTH.
1788          */
1789         if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1790                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1791                 ASSERT3U(lsize, ==, psize);
1792                 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1793                     psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1794         } else {
1795                 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1796                     hdr->b_crypt_hdr.b_mac);
1797         }
1798
1799         if (ret == 0)
1800                 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1801         else if (ret != ENOENT)
1802                 goto error;
1803
1804         if (tmpbuf != NULL)
1805                 abd_free(abd);
1806
1807         return (0);
1808
1809 error:
1810         if (tmpbuf != NULL)
1811                 abd_free(abd);
1812
1813         return (ret);
1814 }
1815
1816 /*
1817  * This function will take a header that only has raw encrypted data in
1818  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1819  * b_l1hdr.b_pabd. If designated in the header flags, this function will
1820  * also decompress the data.
1821  */
1822 static int
1823 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
1824 {
1825         int ret;
1826         abd_t *cabd = NULL;
1827         void *tmp = NULL;
1828         boolean_t no_crypt = B_FALSE;
1829         boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1830
1831         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1832         ASSERT(HDR_ENCRYPTED(hdr));
1833
1834         arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
1835
1836         ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1837             B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1838             hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
1839             hdr->b_crypt_hdr.b_rabd, &no_crypt);
1840         if (ret != 0)
1841                 goto error;
1842
1843         if (no_crypt) {
1844                 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1845                     HDR_GET_PSIZE(hdr));
1846         }
1847
1848         /*
1849          * If this header has disabled arc compression but the b_pabd is
1850          * compressed after decrypting it, we need to decompress the newly
1851          * decrypted data.
1852          */
1853         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1854             !HDR_COMPRESSION_ENABLED(hdr)) {
1855                 /*
1856                  * We want to make sure that we are correctly honoring the
1857                  * zfs_abd_scatter_enabled setting, so we allocate an abd here
1858                  * and then loan a buffer from it, rather than allocating a
1859                  * linear buffer and wrapping it in an abd later.
1860                  */
1861                 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
1862                     ARC_HDR_DO_ADAPT);
1863                 tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
1864
1865                 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1866                     hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
1867                     HDR_GET_LSIZE(hdr), &hdr->b_complevel);
1868                 if (ret != 0) {
1869                         abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
1870                         goto error;
1871                 }
1872
1873                 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
1874                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1875                     arc_hdr_size(hdr), hdr);
1876                 hdr->b_l1hdr.b_pabd = cabd;
1877         }
1878
1879         return (0);
1880
1881 error:
1882         arc_hdr_free_abd(hdr, B_FALSE);
1883         if (cabd != NULL)
1884                 arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
1885
1886         return (ret);
1887 }
1888
1889 /*
1890  * This function is called during arc_buf_fill() to prepare the header's
1891  * abd plaintext pointer for use. This involves authenticated protected
1892  * data and decrypting encrypted data into the plaintext abd.
1893  */
1894 static int
1895 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
1896     const zbookmark_phys_t *zb, boolean_t noauth)
1897 {
1898         int ret;
1899
1900         ASSERT(HDR_PROTECTED(hdr));
1901
1902         if (hash_lock != NULL)
1903                 mutex_enter(hash_lock);
1904
1905         if (HDR_NOAUTH(hdr) && !noauth) {
1906                 /*
1907                  * The caller requested authenticated data but our data has
1908                  * not been authenticated yet. Verify the MAC now if we can.
1909                  */
1910                 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
1911                 if (ret != 0)
1912                         goto error;
1913         } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
1914                 /*
1915                  * If we only have the encrypted version of the data, but the
1916                  * unencrypted version was requested we take this opportunity
1917                  * to store the decrypted version in the header for future use.
1918                  */
1919                 ret = arc_hdr_decrypt(hdr, spa, zb);
1920                 if (ret != 0)
1921                         goto error;
1922         }
1923
1924         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1925
1926         if (hash_lock != NULL)
1927                 mutex_exit(hash_lock);
1928
1929         return (0);
1930
1931 error:
1932         if (hash_lock != NULL)
1933                 mutex_exit(hash_lock);
1934
1935         return (ret);
1936 }
1937
1938 /*
1939  * This function is used by the dbuf code to decrypt bonus buffers in place.
1940  * The dbuf code itself doesn't have any locking for decrypting a shared dnode
1941  * block, so we use the hash lock here to protect against concurrent calls to
1942  * arc_buf_fill().
1943  */
1944 static void
1945 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
1946 {
1947         arc_buf_hdr_t *hdr = buf->b_hdr;
1948
1949         ASSERT(HDR_ENCRYPTED(hdr));
1950         ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1951         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1952         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1953
1954         zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
1955             arc_buf_size(buf));
1956         buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
1957         buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
1958         hdr->b_crypt_hdr.b_ebufcnt -= 1;
1959 }
1960
1961 /*
1962  * Given a buf that has a data buffer attached to it, this function will
1963  * efficiently fill the buf with data of the specified compression setting from
1964  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
1965  * are already sharing a data buf, no copy is performed.
1966  *
1967  * If the buf is marked as compressed but uncompressed data was requested, this
1968  * will allocate a new data buffer for the buf, remove that flag, and fill the
1969  * buf with uncompressed data. You can't request a compressed buf on a hdr with
1970  * uncompressed data, and (since we haven't added support for it yet) if you
1971  * want compressed data your buf must already be marked as compressed and have
1972  * the correct-sized data buffer.
1973  */
1974 static int
1975 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
1976     arc_fill_flags_t flags)
1977 {
1978         int error = 0;
1979         arc_buf_hdr_t *hdr = buf->b_hdr;
1980         boolean_t hdr_compressed =
1981             (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
1982         boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
1983         boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
1984         dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
1985         kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
1986
1987         ASSERT3P(buf->b_data, !=, NULL);
1988         IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
1989         IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
1990         IMPLY(encrypted, HDR_ENCRYPTED(hdr));
1991         IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
1992         IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
1993         IMPLY(encrypted, !ARC_BUF_SHARED(buf));
1994
1995         /*
1996          * If the caller wanted encrypted data we just need to copy it from
1997          * b_rabd and potentially byteswap it. We won't be able to do any
1998          * further transforms on it.
1999          */
2000         if (encrypted) {
2001                 ASSERT(HDR_HAS_RABD(hdr));
2002                 abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
2003                     HDR_GET_PSIZE(hdr));
2004                 goto byteswap;
2005         }
2006
2007         /*
2008          * Adjust encrypted and authenticated headers to accommodate
2009          * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
2010          * allowed to fail decryption due to keys not being loaded
2011          * without being marked as an IO error.
2012          */
2013         if (HDR_PROTECTED(hdr)) {
2014                 error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
2015                     zb, !!(flags & ARC_FILL_NOAUTH));
2016                 if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
2017                         return (error);
2018                 } else if (error != 0) {
2019                         if (hash_lock != NULL)
2020                                 mutex_enter(hash_lock);
2021                         arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2022                         if (hash_lock != NULL)
2023                                 mutex_exit(hash_lock);
2024                         return (error);
2025                 }
2026         }
2027
2028         /*
2029          * There is a special case here for dnode blocks which are
2030          * decrypting their bonus buffers. These blocks may request to
2031          * be decrypted in-place. This is necessary because there may
2032          * be many dnodes pointing into this buffer and there is
2033          * currently no method to synchronize replacing the backing
2034          * b_data buffer and updating all of the pointers. Here we use
2035          * the hash lock to ensure there are no races. If the need
2036          * arises for other types to be decrypted in-place, they must
2037          * add handling here as well.
2038          */
2039         if ((flags & ARC_FILL_IN_PLACE) != 0) {
2040                 ASSERT(!hdr_compressed);
2041                 ASSERT(!compressed);
2042                 ASSERT(!encrypted);
2043
2044                 if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
2045                         ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2046
2047                         if (hash_lock != NULL)
2048                                 mutex_enter(hash_lock);
2049                         arc_buf_untransform_in_place(buf, hash_lock);
2050                         if (hash_lock != NULL)
2051                                 mutex_exit(hash_lock);
2052
2053                         /* Compute the hdr's checksum if necessary */
2054                         arc_cksum_compute(buf);
2055                 }
2056
2057                 return (0);
2058         }
2059
2060         if (hdr_compressed == compressed) {
2061                 if (!arc_buf_is_shared(buf)) {
2062                         abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
2063                             arc_buf_size(buf));
2064                 }
2065         } else {
2066                 ASSERT(hdr_compressed);
2067                 ASSERT(!compressed);
2068
2069                 /*
2070                  * If the buf is sharing its data with the hdr, unlink it and
2071                  * allocate a new data buffer for the buf.
2072                  */
2073                 if (arc_buf_is_shared(buf)) {
2074                         ASSERT(ARC_BUF_COMPRESSED(buf));
2075
2076                         /* We need to give the buf its own b_data */
2077                         buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2078                         buf->b_data =
2079                             arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2080                         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2081
2082                         /* Previously overhead was 0; just add new overhead */
2083                         ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2084                 } else if (ARC_BUF_COMPRESSED(buf)) {
2085                         /* We need to reallocate the buf's b_data */
2086                         arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2087                             buf);
2088                         buf->b_data =
2089                             arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2090
2091                         /* We increased the size of b_data; update overhead */
2092                         ARCSTAT_INCR(arcstat_overhead_size,
2093                             HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2094                 }
2095
2096                 /*
2097                  * Regardless of the buf's previous compression settings, it
2098                  * should not be compressed at the end of this function.
2099                  */
2100                 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2101
2102                 /*
2103                  * Try copying the data from another buf which already has a
2104                  * decompressed version. If that's not possible, it's time to
2105                  * bite the bullet and decompress the data from the hdr.
2106                  */
2107                 if (arc_buf_try_copy_decompressed_data(buf)) {
2108                         /* Skip byteswapping and checksumming (already done) */
2109                         return (0);
2110                 } else {
2111                         error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2112                             hdr->b_l1hdr.b_pabd, buf->b_data,
2113                             HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
2114                             &hdr->b_complevel);
2115
2116                         /*
2117                          * Absent hardware errors or software bugs, this should
2118                          * be impossible, but log it anyway so we can debug it.
2119                          */
2120                         if (error != 0) {
2121                                 zfs_dbgmsg(
2122                                     "hdr %px, compress %d, psize %d, lsize %d",
2123                                     hdr, arc_hdr_get_compress(hdr),
2124                                     HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2125                                 if (hash_lock != NULL)
2126                                         mutex_enter(hash_lock);
2127                                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2128                                 if (hash_lock != NULL)
2129                                         mutex_exit(hash_lock);
2130                                 return (SET_ERROR(EIO));
2131                         }
2132                 }
2133         }
2134
2135 byteswap:
2136         /* Byteswap the buf's data if necessary */
2137         if (bswap != DMU_BSWAP_NUMFUNCS) {
2138                 ASSERT(!HDR_SHARED_DATA(hdr));
2139                 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2140                 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2141         }
2142
2143         /* Compute the hdr's checksum if necessary */
2144         arc_cksum_compute(buf);
2145
2146         return (0);
2147 }
2148
2149 /*
2150  * If this function is being called to decrypt an encrypted buffer or verify an
2151  * authenticated one, the key must be loaded and a mapping must be made
2152  * available in the keystore via spa_keystore_create_mapping() or one of its
2153  * callers.
2154  */
2155 int
2156 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2157     boolean_t in_place)
2158 {
2159         int ret;
2160         arc_fill_flags_t flags = 0;
2161
2162         if (in_place)
2163                 flags |= ARC_FILL_IN_PLACE;
2164
2165         ret = arc_buf_fill(buf, spa, zb, flags);
2166         if (ret == ECKSUM) {
2167                 /*
2168                  * Convert authentication and decryption errors to EIO
2169                  * (and generate an ereport) before leaving the ARC.
2170                  */
2171                 ret = SET_ERROR(EIO);
2172                 spa_log_error(spa, zb);
2173                 (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2174                     spa, NULL, zb, NULL, 0);
2175         }
2176
2177         return (ret);
2178 }
2179
2180 /*
2181  * Increment the amount of evictable space in the arc_state_t's refcount.
2182  * We account for the space used by the hdr and the arc buf individually
2183  * so that we can add and remove them from the refcount individually.
2184  */
2185 static void
2186 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2187 {
2188         arc_buf_contents_t type = arc_buf_type(hdr);
2189
2190         ASSERT(HDR_HAS_L1HDR(hdr));
2191
2192         if (GHOST_STATE(state)) {
2193                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2194                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2195                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2196                 ASSERT(!HDR_HAS_RABD(hdr));
2197                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2198                     HDR_GET_LSIZE(hdr), hdr);
2199                 return;
2200         }
2201
2202         if (hdr->b_l1hdr.b_pabd != NULL) {
2203                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2204                     arc_hdr_size(hdr), hdr);
2205         }
2206         if (HDR_HAS_RABD(hdr)) {
2207                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2208                     HDR_GET_PSIZE(hdr), hdr);
2209         }
2210
2211         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2212             buf = buf->b_next) {
2213                 if (arc_buf_is_shared(buf))
2214                         continue;
2215                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2216                     arc_buf_size(buf), buf);
2217         }
2218 }
2219
2220 /*
2221  * Decrement the amount of evictable space in the arc_state_t's refcount.
2222  * We account for the space used by the hdr and the arc buf individually
2223  * so that we can add and remove them from the refcount individually.
2224  */
2225 static void
2226 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2227 {
2228         arc_buf_contents_t type = arc_buf_type(hdr);
2229
2230         ASSERT(HDR_HAS_L1HDR(hdr));
2231
2232         if (GHOST_STATE(state)) {
2233                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2234                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2235                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2236                 ASSERT(!HDR_HAS_RABD(hdr));
2237                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2238                     HDR_GET_LSIZE(hdr), hdr);
2239                 return;
2240         }
2241
2242         if (hdr->b_l1hdr.b_pabd != NULL) {
2243                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2244                     arc_hdr_size(hdr), hdr);
2245         }
2246         if (HDR_HAS_RABD(hdr)) {
2247                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2248                     HDR_GET_PSIZE(hdr), hdr);
2249         }
2250
2251         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2252             buf = buf->b_next) {
2253                 if (arc_buf_is_shared(buf))
2254                         continue;
2255                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2256                     arc_buf_size(buf), buf);
2257         }
2258 }
2259
2260 /*
2261  * Add a reference to this hdr indicating that someone is actively
2262  * referencing that memory. When the refcount transitions from 0 to 1,
2263  * we remove it from the respective arc_state_t list to indicate that
2264  * it is not evictable.
2265  */
2266 static void
2267 add_reference(arc_buf_hdr_t *hdr, void *tag)
2268 {
2269         arc_state_t *state;
2270
2271         ASSERT(HDR_HAS_L1HDR(hdr));
2272         if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
2273                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2274                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2275                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2276         }
2277
2278         state = hdr->b_l1hdr.b_state;
2279
2280         if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2281             (state != arc_anon)) {
2282                 /* We don't use the L2-only state list. */
2283                 if (state != arc_l2c_only) {
2284                         multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
2285                             hdr);
2286                         arc_evictable_space_decrement(hdr, state);
2287                 }
2288                 /* remove the prefetch flag if we get a reference */
2289                 if (HDR_HAS_L2HDR(hdr))
2290                         l2arc_hdr_arcstats_decrement_state(hdr);
2291                 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2292                 if (HDR_HAS_L2HDR(hdr))
2293                         l2arc_hdr_arcstats_increment_state(hdr);
2294         }
2295 }
2296
2297 /*
2298  * Remove a reference from this hdr. When the reference transitions from
2299  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2300  * list making it eligible for eviction.
2301  */
2302 static int
2303 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2304 {
2305         int cnt;
2306         arc_state_t *state = hdr->b_l1hdr.b_state;
2307
2308         ASSERT(HDR_HAS_L1HDR(hdr));
2309         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2310         ASSERT(!GHOST_STATE(state));
2311
2312         /*
2313          * arc_l2c_only counts as a ghost state so we don't need to explicitly
2314          * check to prevent usage of the arc_l2c_only list.
2315          */
2316         if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2317             (state != arc_anon)) {
2318                 multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
2319                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2320                 arc_evictable_space_increment(hdr, state);
2321         }
2322         return (cnt);
2323 }
2324
2325 /*
2326  * Returns detailed information about a specific arc buffer.  When the
2327  * state_index argument is set the function will calculate the arc header
2328  * list position for its arc state.  Since this requires a linear traversal
2329  * callers are strongly encourage not to do this.  However, it can be helpful
2330  * for targeted analysis so the functionality is provided.
2331  */
2332 void
2333 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
2334 {
2335         arc_buf_hdr_t *hdr = ab->b_hdr;
2336         l1arc_buf_hdr_t *l1hdr = NULL;
2337         l2arc_buf_hdr_t *l2hdr = NULL;
2338         arc_state_t *state = NULL;
2339
2340         memset(abi, 0, sizeof (arc_buf_info_t));
2341
2342         if (hdr == NULL)
2343                 return;
2344
2345         abi->abi_flags = hdr->b_flags;
2346
2347         if (HDR_HAS_L1HDR(hdr)) {
2348                 l1hdr = &hdr->b_l1hdr;
2349                 state = l1hdr->b_state;
2350         }
2351         if (HDR_HAS_L2HDR(hdr))
2352                 l2hdr = &hdr->b_l2hdr;
2353
2354         if (l1hdr) {
2355                 abi->abi_bufcnt = l1hdr->b_bufcnt;
2356                 abi->abi_access = l1hdr->b_arc_access;
2357                 abi->abi_mru_hits = l1hdr->b_mru_hits;
2358                 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
2359                 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
2360                 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
2361                 abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
2362         }
2363
2364         if (l2hdr) {
2365                 abi->abi_l2arc_dattr = l2hdr->b_daddr;
2366                 abi->abi_l2arc_hits = l2hdr->b_hits;
2367         }
2368
2369         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
2370         abi->abi_state_contents = arc_buf_type(hdr);
2371         abi->abi_size = arc_hdr_size(hdr);
2372 }
2373
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 *
 * The work is done in four steps:
 *  1. If the hdr has no holds (and is therefore evictable), take it off the
 *     old state's list and put it on the new state's list, adjusting each
 *     state's evictable size.  arc_anon and arc_l2c_only have no list.
 *  2. If the new state is arc_anon, remove the hdr from the hash table.
 *  3. Charge the hdr's space to the new state's arcs_size and release it
 *     from the old state's arcs_size (arc_l2c_only is ignored).
 *  4. Record the new state in the L1 hdr, if there is one, and keep the
 *     L2ARC per-state stats in step.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
        arc_state_t *old_state;
        int64_t refcnt;
        uint32_t bufcnt;
        /* whether arcs_size of the old / new state needs adjusting */
        boolean_t update_old, update_new;
        arc_buf_contents_t buftype = arc_buf_type(hdr);

        /*
         * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
         * in arc_read() when bringing a buffer out of the L2ARC.  However, the
         * L1 hdr doesn't always exist when we change state to arc_anon before
         * destroying a header, in which case reallocating to add the L1 hdr is
         * pointless.
         */
        if (HDR_HAS_L1HDR(hdr)) {
                old_state = hdr->b_l1hdr.b_state;
                refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
                bufcnt = hdr->b_l1hdr.b_bufcnt;
                /* there is size to account for if any buf or data exists */
                update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
                    HDR_HAS_RABD(hdr));
        } else {
                /* without an L1 hdr, treat the hdr as L2-only with no holds */
                old_state = arc_l2c_only;
                refcnt = 0;
                bufcnt = 0;
                update_old = B_FALSE;
        }
        update_new = update_old;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
        ASSERT(old_state != arc_anon || bufcnt <= 1);

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon && old_state != arc_l2c_only) {
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_remove(&old_state->arcs_list[buftype], hdr);

                        if (GHOST_STATE(old_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                /*
                                 * A ghost hdr has no bufs or data, yet its
                                 * logical size is charged to the ghost
                                 * state, so force the size update.
                                 */
                                update_old = B_TRUE;
                        }
                        arc_evictable_space_decrement(hdr, old_state);
                }
                if (new_state != arc_anon && new_state != arc_l2c_only) {
                        /*
                         * An L1 header always exists here, since if we're
                         * moving to some L1-cached state (i.e. not l2c_only or
                         * anonymous), we realloc the header to add an L1hdr
                         * beforehand.
                         */
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_insert(&new_state->arcs_list[buftype], hdr);

                        if (GHOST_STATE(new_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                /* same reasoning as for the old state above */
                                update_new = B_TRUE;
                        }
                        arc_evictable_space_increment(hdr, new_state);
                }
        }

        ASSERT(!HDR_EMPTY(hdr));
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
                buf_hash_remove(hdr);

        /* adjust state sizes (ignore arc_l2c_only) */

        if (update_new && new_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(new_state)) {
                        ASSERT0(bufcnt);

                        /*
                         * When moving a header to a ghost state, we first
                         * remove all arc buffers. Thus, we'll have a
                         * bufcnt of zero, and no arc buffer to use for
                         * the reference. As a result, we use the arc
                         * header pointer for the reference.
                         */
                        (void) zfs_refcount_add_many(&new_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                        ASSERT(!HDR_HAS_RABD(hdr));
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must add each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * add to the refcount if the arc_buf_t is
                                 * not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    arc_buf_size(buf), buf);
                        }
                        /* the b_buf list length must agree with b_bufcnt */
                        ASSERT3U(bufcnt, ==, buffers);

                        if (hdr->b_l1hdr.b_pabd != NULL) {
                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    arc_hdr_size(hdr), hdr);
                        }

                        if (HDR_HAS_RABD(hdr)) {
                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    HDR_GET_PSIZE(hdr), hdr);
                        }
                }
        }

        if (update_old && old_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(old_state)) {
                        ASSERT0(bufcnt);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                        ASSERT(!HDR_HAS_RABD(hdr));

                        /*
                         * When moving a header off of a ghost state,
                         * the header will not contain any arc buffers.
                         * We use the arc header pointer for the reference
                         * which is exactly what we did when we put the
                         * header on the ghost state.
                         */

                        (void) zfs_refcount_remove_many(&old_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must remove each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * remove from the refcount if the arc_buf_t
                                 * is not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, arc_buf_size(buf),
                                    buf);
                        }
                        /* the b_buf list length must agree with b_bufcnt */
                        ASSERT3U(bufcnt, ==, buffers);
                        ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
                            HDR_HAS_RABD(hdr));

                        if (hdr->b_l1hdr.b_pabd != NULL) {
                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, arc_hdr_size(hdr),
                                    hdr);
                        }

                        if (HDR_HAS_RABD(hdr)) {
                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, HDR_GET_PSIZE(hdr),
                                    hdr);
                        }
                }
        }

        /* Only an L1 hdr has a b_state field to record the new state in. */
        if (HDR_HAS_L1HDR(hdr)) {
                hdr->b_l1hdr.b_state = new_state;

                /*
                 * The state recorded in the L2 hdr changes too, so the
                 * L2ARC stats are dropped under the old value and re-added
                 * under the new one.
                 */
                if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
                        l2arc_hdr_arcstats_decrement_state(hdr);
                        hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
                        l2arc_hdr_arcstats_increment_state(hdr);
                }
        }
}
2586
2587 void
2588 arc_space_consume(uint64_t space, arc_space_type_t type)
2589 {
2590         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2591
2592         switch (type) {
2593         default:
2594                 break;
2595         case ARC_SPACE_DATA:
2596                 ARCSTAT_INCR(arcstat_data_size, space);
2597                 break;
2598         case ARC_SPACE_META:
2599                 ARCSTAT_INCR(arcstat_metadata_size, space);
2600                 break;
2601         case ARC_SPACE_BONUS:
2602                 ARCSTAT_INCR(arcstat_bonus_size, space);
2603                 break;
2604         case ARC_SPACE_DNODE:
2605                 aggsum_add(&arc_sums.arcstat_dnode_size, space);
2606                 break;
2607         case ARC_SPACE_DBUF:
2608                 ARCSTAT_INCR(arcstat_dbuf_size, space);
2609                 break;
2610         case ARC_SPACE_HDRS:
2611                 ARCSTAT_INCR(arcstat_hdr_size, space);
2612                 break;
2613         case ARC_SPACE_L2HDRS:
2614                 aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
2615                 break;
2616         case ARC_SPACE_ABD_CHUNK_WASTE:
2617                 /*
2618                  * Note: this includes space wasted by all scatter ABD's, not
2619                  * just those allocated by the ARC.  But the vast majority of
2620                  * scatter ABD's come from the ARC, because other users are
2621                  * very short-lived.
2622                  */
2623                 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
2624                 break;
2625         }
2626
2627         if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
2628                 aggsum_add(&arc_sums.arcstat_meta_used, space);
2629
2630         aggsum_add(&arc_sums.arcstat_size, space);
2631 }
2632
2633 void
2634 arc_space_return(uint64_t space, arc_space_type_t type)
2635 {
2636         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2637
2638         switch (type) {
2639         default:
2640                 break;
2641         case ARC_SPACE_DATA:
2642                 ARCSTAT_INCR(arcstat_data_size, -space);
2643                 break;
2644         case ARC_SPACE_META:
2645                 ARCSTAT_INCR(arcstat_metadata_size, -space);
2646                 break;
2647         case ARC_SPACE_BONUS:
2648                 ARCSTAT_INCR(arcstat_bonus_size, -space);
2649                 break;
2650         case ARC_SPACE_DNODE:
2651                 aggsum_add(&arc_sums.arcstat_dnode_size, -space);
2652                 break;
2653         case ARC_SPACE_DBUF:
2654                 ARCSTAT_INCR(arcstat_dbuf_size, -space);
2655                 break;
2656         case ARC_SPACE_HDRS:
2657                 ARCSTAT_INCR(arcstat_hdr_size, -space);
2658                 break;
2659         case ARC_SPACE_L2HDRS:
2660                 aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
2661                 break;
2662         case ARC_SPACE_ABD_CHUNK_WASTE:
2663                 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
2664                 break;
2665         }
2666
2667         if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) {
2668                 ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used,
2669                     space) >= 0);
2670                 ARCSTAT_MAX(arcstat_meta_max,
2671                     aggsum_upper_bound(&arc_sums.arcstat_meta_used));
2672                 aggsum_add(&arc_sums.arcstat_meta_used, -space);
2673         }
2674
2675         ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
2676         aggsum_add(&arc_sums.arcstat_size, -space);
2677 }
2678
2679 /*
2680  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2681  * with the hdr's b_pabd.
2682  */
2683 static boolean_t
2684 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2685 {
2686         /*
2687          * The criteria for sharing a hdr's data are:
2688          * 1. the buffer is not encrypted
2689          * 2. the hdr's compression matches the buf's compression
2690          * 3. the hdr doesn't need to be byteswapped
2691          * 4. the hdr isn't already being shared
2692          * 5. the buf is either compressed or it is the last buf in the hdr list
2693          *
2694          * Criterion #5 maintains the invariant that shared uncompressed
2695          * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2696          * might ask, "if a compressed buf is allocated first, won't that be the
2697          * last thing in the list?", but in that case it's impossible to create
2698          * a shared uncompressed buf anyway (because the hdr must be compressed
2699          * to have the compressed buf). You might also think that #3 is
2700          * sufficient to make this guarantee, however it's possible
2701          * (specifically in the rare L2ARC write race mentioned in
2702          * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2703          * is shareable, but wasn't at the time of its allocation. Rather than
2704          * allow a new shared uncompressed buf to be created and then shuffle
2705          * the list around to make it the last element, this simply disallows
2706          * sharing if the new buf isn't the first to be added.
2707          */
2708         ASSERT3P(buf->b_hdr, ==, hdr);
2709         boolean_t hdr_compressed =
2710             arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
2711         boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2712         return (!ARC_BUF_ENCRYPTED(buf) &&
2713             buf_compressed == hdr_compressed &&
2714             hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2715             !HDR_SHARED_DATA(hdr) &&
2716             (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2717 }
2718
/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
 * copy was made successfully, or an error code otherwise.
 *
 * The new buf takes a hold on the hdr on behalf of tag and is linked at the
 * head of the hdr's b_buf list.  *ret must be NULL on entry.  It is assigned
 * the new buf before any fill is attempted, so it is set even when a
 * non-zero value is returned.  encrypted implies compressed.  zb must not
 * be NULL when fill is set.
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
    void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
    boolean_t fill, arc_buf_t **ret)
{
        arc_buf_t *buf;
        arc_fill_flags_t flags = ARC_FILL_LOCKED;

        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
        VERIFY(hdr->b_type == ARC_BUFC_DATA ||
            hdr->b_type == ARC_BUFC_METADATA);
        ASSERT3P(ret, !=, NULL);
        ASSERT3P(*ret, ==, NULL);
        IMPLY(encrypted, compressed);

        /*
         * The buf points at the current head of the hdr's list through
         * b_next, but the hdr is only updated to point at the buf further
         * down, once b_data has been set up.
         */
        buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_next = hdr->b_l1hdr.b_buf;
        buf->b_flags = 0;

        add_reference(hdr, tag);

        /*
         * We're about to change the hdr's b_flags. We must either
         * hold the hash_lock or be undiscoverable.
         */
        ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

        /*
         * Only honor requests for compressed bufs if the hdr is actually
         * compressed. This must be overridden if the buffer is encrypted since
         * encrypted buffers cannot be decompressed.
         */
        if (encrypted) {
                buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
                buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
                flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
        } else if (compressed &&
            arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
                buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
                flags |= ARC_FILL_COMPRESSED;
        }

        /* noauth and encrypted are mutually exclusive */
        if (noauth) {
                ASSERT0(encrypted);
                flags |= ARC_FILL_NOAUTH;
        }

        /*
         * If the hdr's data can be shared then we share the data buffer and
         * set the appropriate bit in the hdr's b_flags to indicate the hdr is
         * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
         * buffer to store the buf's data.
         *
         * There are two additional restrictions here because we're sharing
         * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
         * actively involved in an L2ARC write, because if this buf is used by
         * an arc_write() then the hdr's data buffer will be released when the
         * write completes, even though the L2ARC write might still be using it.
         * Second, the hdr's ABD must be linear so that the buf's user doesn't
         * need to be ABD-aware.  It must be allocated via
         * zio_[data_]buf_alloc(), not as a page, because we need to be able
         * to abd_release_ownership_of_buf(), which isn't allowed on "linear
         * page" buffers because the ABD code needs to handle freeing them
         * specially.
         */
        boolean_t can_share = arc_can_share(hdr, buf) &&
            !HDR_L2_WRITING(hdr) &&
            hdr->b_l1hdr.b_pabd != NULL &&
            abd_is_linear(hdr->b_l1hdr.b_pabd) &&
            !abd_is_linear_page(hdr->b_l1hdr.b_pabd);

        /* Set up b_data and sharing */
        if (can_share) {
                buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
                buf->b_flags |= ARC_BUF_FLAG_SHARED;
                arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
        } else {
                /* a private copy is counted as ARC overhead */
                buf->b_data =
                    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
                ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
        }
        VERIFY3P(buf->b_data, !=, NULL);

        /* Link the buf at the head of the hdr's list and count it. */
        hdr->b_l1hdr.b_buf = buf;
        hdr->b_l1hdr.b_bufcnt += 1;
        if (encrypted)
                hdr->b_crypt_hdr.b_ebufcnt += 1;

        /*
         * If the user wants the data from the hdr, we need to either copy or
         * decompress the data.
         */
        if (fill) {
                ASSERT3P(zb, !=, NULL);
                return (arc_buf_fill(buf, spa, zb, flags));
        }

        return (0);
}
2826
/*
 * Holder tag used for the hold on a hdr while its buf is out on loan: the
 * loan routines below allocate bufs with this tag, arc_return_buf() swaps
 * it for the caller's tag, and arc_loan_inuse_buf() does the reverse.
 */
static char *arc_onloan_tag = "onloan";
2828
/*
 * Atomically adjust the global arc_loaned_bytes counter by delta.  Callers
 * in this file pass a positive delta when a buf goes out on loan and a
 * negative delta when a loaned buf is returned.
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
        atomic_add_64(&arc_loaned_bytes, delta);

        /*
         * assert that it did not wrap around; adding zero with the _nv
         * variant is used to fetch the counter's current value
         */
        ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
2837
2838 /*
2839  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2840  * flight data by arc_tempreserve_space() until they are "returned". Loaned
2841  * buffers must be returned to the arc before they can be used by the DMU or
2842  * freed.
2843  */
2844 arc_buf_t *
2845 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2846 {
2847         arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2848             is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2849
2850         arc_loaned_bytes_update(arc_buf_size(buf));
2851
2852         return (buf);
2853 }
2854
2855 arc_buf_t *
2856 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2857     enum zio_compress compression_type, uint8_t complevel)
2858 {
2859         arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2860             psize, lsize, compression_type, complevel);
2861
2862         arc_loaned_bytes_update(arc_buf_size(buf));
2863
2864         return (buf);
2865 }
2866
2867 arc_buf_t *
2868 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2869     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2870     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2871     enum zio_compress compression_type, uint8_t complevel)
2872 {
2873         arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2874             byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
2875             complevel);
2876
2877         atomic_add_64(&arc_loaned_bytes, psize);
2878         return (buf);
2879 }
2880
2881
2882 /*
2883  * Return a loaned arc buffer to the arc.
2884  */
2885 void
2886 arc_return_buf(arc_buf_t *buf, void *tag)
2887 {
2888         arc_buf_hdr_t *hdr = buf->b_hdr;
2889
2890         ASSERT3P(buf->b_data, !=, NULL);
2891         ASSERT(HDR_HAS_L1HDR(hdr));
2892         (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2893         (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2894
2895         arc_loaned_bytes_update(-arc_buf_size(buf));
2896 }
2897
2898 /* Detach an arc_buf from a dbuf (tag) */
2899 void
2900 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2901 {
2902         arc_buf_hdr_t *hdr = buf->b_hdr;
2903
2904         ASSERT3P(buf->b_data, !=, NULL);
2905         ASSERT(HDR_HAS_L1HDR(hdr));
2906         (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2907         (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2908
2909         arc_loaned_bytes_update(arc_buf_size(buf));
2910 }
2911
2912 static void
2913 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
2914 {
2915         l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2916
2917         df->l2df_abd = abd;
2918         df->l2df_size = size;
2919         df->l2df_type = type;
2920         mutex_enter(&l2arc_free_on_write_mtx);
2921         list_insert_head(l2arc_free_on_write, df);
2922         mutex_exit(&l2arc_free_on_write_mtx);
2923 }
2924
2925 static void
2926 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
2927 {
2928         arc_state_t *state = hdr->b_l1hdr.b_state;
2929         arc_buf_contents_t type = arc_buf_type(hdr);
2930         uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
2931
2932         /* protected by hash lock, if in the hash table */
2933         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2934                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2935                 ASSERT(state != arc_anon && state != arc_l2c_only);
2936
2937                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2938                     size, hdr);
2939         }
2940         (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
2941         if (type == ARC_BUFC_METADATA) {
2942                 arc_space_return(size, ARC_SPACE_META);
2943         } else {
2944                 ASSERT(type == ARC_BUFC_DATA);
2945                 arc_space_return(size, ARC_SPACE_DATA);
2946         }
2947
2948         if (free_rdata) {
2949                 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
2950         } else {
2951                 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
2952         }
2953 }
2954
2955 /*
2956  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2957  * data buffer, we transfer the refcount ownership to the hdr and update
2958  * the appropriate kstats.
2959  */
2960 static void
2961 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2962 {
2963         ASSERT(arc_can_share(hdr, buf));
2964         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2965         ASSERT(!ARC_BUF_ENCRYPTED(buf));
2966         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2967
2968         /*
2969          * Start sharing the data buffer. We transfer the
2970          * refcount ownership to the hdr since it always owns
2971          * the refcount whenever an arc_buf_t is shared.
2972          */
2973         zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2974             arc_hdr_size(hdr), buf, hdr);
2975         hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
2976         abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
2977             HDR_ISTYPE_METADATA(hdr));
2978         arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2979         buf->b_flags |= ARC_BUF_FLAG_SHARED;
2980
2981         /*
2982          * Since we've transferred ownership to the hdr we need
2983          * to increment its compressed and uncompressed kstats and
2984          * decrement the overhead size.
2985          */
2986         ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2987         ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2988         ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
2989 }
2990
2991 static void
2992 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2993 {
2994         ASSERT(arc_buf_is_shared(buf));
2995         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2996         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2997
2998         /*
2999          * We are no longer sharing this buffer so we need
3000          * to transfer its ownership to the rightful owner.
3001          */
3002         zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
3003             arc_hdr_size(hdr), hdr, buf);
3004         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3005         abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
3006         abd_free(hdr->b_l1hdr.b_pabd);
3007         hdr->b_l1hdr.b_pabd = NULL;
3008         buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
3009
3010         /*
3011          * Since the buffer is no longer shared between
3012          * the arc buf and the hdr, count it as overhead.
3013          */
3014         ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3015         ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3016         ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
3017 }
3018
3019 /*
3020  * Remove an arc_buf_t from the hdr's buf list and return the last
3021  * arc_buf_t on the list. If no buffers remain on the list then return
3022  * NULL.
3023  */
3024 static arc_buf_t *
3025 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3026 {
3027         ASSERT(HDR_HAS_L1HDR(hdr));
3028         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
3029
3030         arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3031         arc_buf_t *lastbuf = NULL;
3032
3033         /*
3034          * Remove the buf from the hdr list and locate the last
3035          * remaining buffer on the list.
3036          */
3037         while (*bufp != NULL) {
3038                 if (*bufp == buf)
3039                         *bufp = buf->b_next;
3040
3041                 /*
3042                  * If we've removed a buffer in the middle of
3043                  * the list then update the lastbuf and update
3044                  * bufp.
3045                  */
3046                 if (*bufp != NULL) {
3047                         lastbuf = *bufp;
3048                         bufp = &(*bufp)->b_next;
3049                 }
3050         }
3051         buf->b_next = NULL;
3052         ASSERT3P(lastbuf, !=, buf);
3053         IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
3054         IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
3055         IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3056
3057         return (lastbuf);
3058 }
3059
3060 /*
3061  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
3062  * list and free it.
3063  */
3064 static void
3065 arc_buf_destroy_impl(arc_buf_t *buf)
3066 {
3067         arc_buf_hdr_t *hdr = buf->b_hdr;
3068
3069         /*
3070          * Free up the data associated with the buf but only if we're not
3071          * sharing this with the hdr. If we are sharing it with the hdr, the
3072          * hdr is responsible for doing the free.
3073          */
3074         if (buf->b_data != NULL) {
3075                 /*
3076                  * We're about to change the hdr's b_flags. We must either
3077                  * hold the hash_lock or be undiscoverable.
3078                  */
3079                 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
3080
3081                 arc_cksum_verify(buf);
3082                 arc_buf_unwatch(buf);
3083
3084                 if (arc_buf_is_shared(buf)) {
3085                         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3086                 } else {
3087                         uint64_t size = arc_buf_size(buf);
3088                         arc_free_data_buf(hdr, buf->b_data, size, buf);
3089                         ARCSTAT_INCR(arcstat_overhead_size, -size);
3090                 }
3091                 buf->b_data = NULL;
3092
3093                 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3094                 hdr->b_l1hdr.b_bufcnt -= 1;
3095
3096                 if (ARC_BUF_ENCRYPTED(buf)) {
3097                         hdr->b_crypt_hdr.b_ebufcnt -= 1;
3098
3099                         /*
3100                          * If we have no more encrypted buffers and we've
3101                          * already gotten a copy of the decrypted data we can
3102                          * free b_rabd to save some space.
3103                          */
3104                         if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
3105                             HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
3106                             !HDR_IO_IN_PROGRESS(hdr)) {
3107                                 arc_hdr_free_abd(hdr, B_TRUE);
3108                         }
3109                 }
3110         }
3111
3112         arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
3113
3114         if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
3115                 /*
3116                  * If the current arc_buf_t is sharing its data buffer with the
3117                  * hdr, then reassign the hdr's b_pabd to share it with the new
3118                  * buffer at the end of the list. The shared buffer is always
3119                  * the last one on the hdr's buffer list.
3120                  *
3121                  * There is an equivalent case for compressed bufs, but since
3122                  * they aren't guaranteed to be the last buf in the list and
3123                  * that is an exceedingly rare case, we just allow that space be
3124                  * wasted temporarily. We must also be careful not to share
3125                  * encrypted buffers, since they cannot be shared.
3126                  */
3127                 if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
3128                         /* Only one buf can be shared at once */
3129                         VERIFY(!arc_buf_is_shared(lastbuf));
3130                         /* hdr is uncompressed so can't have compressed buf */
3131                         VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
3132
3133                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3134                         arc_hdr_free_abd(hdr, B_FALSE);
3135
3136                         /*
3137                          * We must setup a new shared block between the
3138                          * last buffer and the hdr. The data would have
3139                          * been allocated by the arc buf so we need to transfer
3140                          * ownership to the hdr since it's now being shared.
3141                          */
3142                         arc_share_buf(hdr, lastbuf);
3143                 }
3144         } else if (HDR_SHARED_DATA(hdr)) {
3145                 /*
3146                  * Uncompressed shared buffers are always at the end
3147                  * of the list. Compressed buffers don't have the
3148                  * same requirements. This makes it hard to
3149                  * simply assert that the lastbuf is shared so
3150                  * we rely on the hdr's compression flags to determine
3151                  * if we have a compressed, shared buffer.
3152                  */
3153                 ASSERT3P(lastbuf, !=, NULL);
3154                 ASSERT(arc_buf_is_shared(lastbuf) ||
3155                     arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
3156         }
3157
3158         /*
3159          * Free the checksum if we're removing the last uncompressed buf from
3160          * this hdr.
3161          */
3162         if (!arc_hdr_has_uncompressed_buf(hdr)) {
3163                 arc_cksum_free(hdr);
3164         }
3165
3166         /* clean up the buf */
3167         buf->b_hdr = NULL;
3168         kmem_cache_free(buf_cache, buf);
3169 }
3170
3171 static void
3172 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
3173 {
3174         uint64_t size;
3175         boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
3176
3177         ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3178         ASSERT(HDR_HAS_L1HDR(hdr));
3179         ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3180         IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3181
3182         if (alloc_rdata) {
3183                 size = HDR_GET_PSIZE(hdr);
3184                 ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3185                 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
3186                     alloc_flags);
3187                 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3188                 ARCSTAT_INCR(arcstat_raw_size, size);
3189         } else {
3190                 size = arc_hdr_size(hdr);
3191                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3192                 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
3193                     alloc_flags);
3194                 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3195         }
3196
3197         ARCSTAT_INCR(arcstat_compressed_size, size);
3198         ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3199 }
3200
3201 static void
3202 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3203 {
3204         uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3205
3206         ASSERT(HDR_HAS_L1HDR(hdr));
3207         ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3208         IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3209
3210         /*
3211          * If the hdr is currently being written to the l2arc then
3212          * we defer freeing the data by adding it to the l2arc_free_on_write
3213          * list. The l2arc will free the data once it's finished
3214          * writing it to the l2arc device.
3215          */
3216         if (HDR_L2_WRITING(hdr)) {
3217                 arc_hdr_free_on_write(hdr, free_rdata);
3218                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
3219         } else if (free_rdata) {
3220                 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3221         } else {
3222                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
3223         }
3224
3225         if (free_rdata) {
3226                 hdr->b_crypt_hdr.b_rabd = NULL;
3227                 ARCSTAT_INCR(arcstat_raw_size, -size);
3228         } else {
3229                 hdr->b_l1hdr.b_pabd = NULL;
3230         }
3231
3232         if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3233                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3234
3235         ARCSTAT_INCR(arcstat_compressed_size, -size);
3236         ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3237 }
3238
3239 /*
3240  * Allocate empty anonymous ARC header.  The header will get its identity
3241  * assigned and buffers attached later as part of read or write operations.
3242  *
3243  * In case of read arc_read() assigns header its identify (b_dva + b_birth),
3244  * inserts it into ARC hash to become globally visible and allocates physical
3245  * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk.  On disk read
3246  * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
3247  * sharing one of them with the physical ABD buffer.
3248  *
3249  * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
3250  * data.  Then after compression and/or encryption arc_write_ready() allocates
3251  * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
3252  * buffer.  On disk write completion arc_write_done() assigns the header its
3253  * new identity (b_dva + b_birth) and inserts into ARC hash.
3254  *
3255  * In case of partial overwrite the old data is read first as described. Then
3256  * arc_release() either allocates new anonymous ARC header and moves the ARC
3257  * buffer to it, or reuses the old ARC header by discarding its identity and
3258  * removing it from ARC hash.  After buffer modification normal write process
3259  * follows as described.
3260  */
3261 static arc_buf_hdr_t *
3262 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3263     boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
3264     arc_buf_contents_t type)
3265 {
3266         arc_buf_hdr_t *hdr;
3267
3268         VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3269         if (protected) {
3270                 hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3271         } else {
3272                 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3273         }
3274
3275         ASSERT(HDR_EMPTY(hdr));
3276         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3277         HDR_SET_PSIZE(hdr, psize);
3278         HDR_SET_LSIZE(hdr, lsize);
3279         hdr->b_spa = spa;
3280         hdr->b_type = type;
3281         hdr->b_flags = 0;
3282         arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3283         arc_hdr_set_compress(hdr, compression_type);
3284         hdr->b_complevel = complevel;
3285         if (protected)
3286                 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3287
3288         hdr->b_l1hdr.b_state = arc_anon;
3289         hdr->b_l1hdr.b_arc_access = 0;
3290         hdr->b_l1hdr.b_mru_hits = 0;
3291         hdr->b_l1hdr.b_mru_ghost_hits = 0;
3292         hdr->b_l1hdr.b_mfu_hits = 0;
3293         hdr->b_l1hdr.b_mfu_ghost_hits = 0;
3294         hdr->b_l1hdr.b_bufcnt = 0;
3295         hdr->b_l1hdr.b_buf = NULL;
3296
3297         ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3298
3299         return (hdr);
3300 }
3301
3302 /*
3303  * Transition between the two allocation states for the arc_buf_hdr struct.
3304  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3305  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3306  * version is used when a cache buffer is only in the L2ARC in order to reduce
3307  * memory usage.
3308  */
3309 static arc_buf_hdr_t *
3310 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3311 {
3312         ASSERT(HDR_HAS_L2HDR(hdr));
3313
3314         arc_buf_hdr_t *nhdr;
3315         l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3316
3317         ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3318             (old == hdr_l2only_cache && new == hdr_full_cache));
3319
3320         /*
3321          * if the caller wanted a new full header and the header is to be
3322          * encrypted we will actually allocate the header from the full crypt
3323          * cache instead. The same applies to freeing from the old cache.
3324          */
3325         if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3326                 new = hdr_full_crypt_cache;
3327         if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3328                 old = hdr_full_crypt_cache;
3329
3330         nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3331
3332         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3333         buf_hash_remove(hdr);
3334
3335         bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3336
3337         if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
3338                 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3339                 /*
3340                  * arc_access and arc_change_state need to be aware that a
3341                  * header has just come out of L2ARC, so we set its state to
3342                  * l2c_only even though it's about to change.
3343                  */
3344                 nhdr->b_l1hdr.b_state = arc_l2c_only;
3345
3346                 /* Verify previous threads set to NULL before freeing */
3347                 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3348                 ASSERT(!HDR_HAS_RABD(hdr));
3349         } else {
3350                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3351                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
3352                 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3353
3354                 /*
3355                  * If we've reached here, We must have been called from
3356                  * arc_evict_hdr(), as such we should have already been
3357                  * removed from any ghost list we were previously on
3358                  * (which protects us from racing with arc_evict_state),
3359                  * thus no locking is needed during this check.
3360                  */
3361                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3362
3363                 /*
3364                  * A buffer must not be moved into the arc_l2c_only
3365                  * state if it's not finished being written out to the
3366                  * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3367                  * might try to be accessed, even though it was removed.
3368                  */
3369                 VERIFY(!HDR_L2_WRITING(hdr));
3370                 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3371                 ASSERT(!HDR_HAS_RABD(hdr));
3372
3373                 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3374         }
3375         /*
3376          * The header has been reallocated so we need to re-insert it into any
3377          * lists it was on.
3378          */
3379         (void) buf_hash_insert(nhdr, NULL);
3380
3381         ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3382
3383         mutex_enter(&dev->l2ad_mtx);
3384
3385         /*
3386          * We must place the realloc'ed header back into the list at
3387          * the same spot. Otherwise, if it's placed earlier in the list,
3388          * l2arc_write_buffers() could find it during the function's
3389          * write phase, and try to write it out to the l2arc.
3390          */
3391         list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3392         list_remove(&dev->l2ad_buflist, hdr);
3393
3394         mutex_exit(&dev->l2ad_mtx);
3395
3396         /*
3397          * Since we're using the pointer address as the tag when
3398          * incrementing and decrementing the l2ad_alloc refcount, we
3399          * must remove the old pointer (that we're about to destroy) and
3400          * add the new pointer to the refcount. Otherwise we'd remove
3401          * the wrong pointer address when calling arc_hdr_destroy() later.
3402          */
3403
3404         (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
3405             arc_hdr_size(hdr), hdr);
3406         (void) zfs_refcount_add_many(&dev->l2ad_alloc,
3407             arc_hdr_size(nhdr), nhdr);
3408
3409         buf_discard_identity(hdr);
3410         kmem_cache_free(old, hdr);
3411
3412         return (nhdr);
3413 }
3414
3415 /*
3416  * This function allows an L1 header to be reallocated as a crypt
3417  * header and vice versa. If we are going to a crypt header, the
3418  * new fields will be zeroed out.
3419  */
3420 static arc_buf_hdr_t *
3421 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3422 {
3423         arc_buf_hdr_t *nhdr;
3424         arc_buf_t *buf;
3425         kmem_cache_t *ncache, *ocache;
3426         unsigned nsize, osize;
3427
3428         /*
3429          * This function requires that hdr is in the arc_anon state.
3430          * Therefore it won't have any L2ARC data for us to worry
3431          * about copying.
3432          */
3433         ASSERT(HDR_HAS_L1HDR(hdr));
3434         ASSERT(!HDR_HAS_L2HDR(hdr));
3435         ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3436         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3437         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3438         ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3439         ASSERT3P(hdr->b_hash_next, ==, NULL);
3440
3441         if (need_crypt) {
3442                 ncache = hdr_full_crypt_cache;
3443                 nsize = sizeof (hdr->b_crypt_hdr);
3444                 ocache = hdr_full_cache;
3445                 osize = HDR_FULL_SIZE;
3446         } else {
3447                 ncache = hdr_full_cache;
3448                 nsize = HDR_FULL_SIZE;
3449                 ocache = hdr_full_crypt_cache;
3450                 osize = sizeof (hdr->b_crypt_hdr);
3451         }
3452
3453         nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
3454
3455         /*
3456          * Copy all members that aren't locks or condvars to the new header.
3457          * No lists are pointing to us (as we asserted above), so we don't
3458          * need to worry about the list nodes.
3459          */
3460         nhdr->b_dva = hdr->b_dva;
3461         nhdr->b_birth = hdr->b_birth;
3462         nhdr->b_type = hdr->b_type;
3463         nhdr->b_flags = hdr->b_flags;
3464         nhdr->b_psize = hdr->b_psize;
3465         nhdr->b_lsize = hdr->b_lsize;
3466         nhdr->b_spa = hdr->b_spa;
3467         nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
3468         nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
3469         nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
3470         nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
3471         nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
3472         nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
3473         nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
3474         nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
3475         nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
3476         nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
3477         nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
3478
3479         /*
3480          * This zfs_refcount_add() exists only to ensure that the individual
3481          * arc buffers always point to a header that is referenced, avoiding
3482          * a small race condition that could trigger ASSERTs.
3483          */
3484         (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
3485         nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
3486         for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
3487                 mutex_enter(&buf->b_evict_lock);
3488                 buf->b_hdr = nhdr;
3489                 mutex_exit(&buf->b_evict_lock);
3490         }
3491
3492         zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
3493         (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
3494         ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3495
3496         if (need_crypt) {
3497                 arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
3498         } else {
3499                 arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
3500         }
3501
3502         /* unset all members of the original hdr */
3503         bzero(&hdr->b_dva, sizeof (dva_t));
3504         hdr->b_birth = 0;
3505         hdr->b_type = ARC_BUFC_INVALID;
3506         hdr->b_flags = 0;
3507         hdr->b_psize = 0;
3508         hdr->b_lsize = 0;
3509         hdr->b_spa = 0;
3510         hdr->b_l1hdr.b_freeze_cksum = NULL;
3511         hdr->b_l1hdr.b_buf = NULL;
3512         hdr->b_l1hdr.b_bufcnt = 0;
3513         hdr->b_l1hdr.b_byteswap = 0;
3514         hdr->b_l1hdr.b_state = NULL;
3515         hdr->b_l1hdr.b_arc_access = 0;
3516         hdr->b_l1hdr.b_mru_hits = 0;
3517         hdr->b_l1hdr.b_mru_ghost_hits = 0;
3518         hdr->b_l1hdr.b_mfu_hits = 0;
3519         hdr->b_l1hdr.b_mfu_ghost_hits = 0;
3520         hdr->b_l1hdr.b_acb = NULL;
3521         hdr->b_l1hdr.b_pabd = NULL;
3522
3523         if (ocache == hdr_full_crypt_cache) {
3524                 ASSERT(!HDR_HAS_RABD(hdr));
3525                 hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
3526                 hdr->b_crypt_hdr.b_ebufcnt = 0;
3527                 hdr->b_crypt_hdr.b_dsobj = 0;
3528                 bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3529                 bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3530                 bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3531         }
3532
3533         buf_discard_identity(hdr);
3534         kmem_cache_free(ocache, hdr);
3535
3536         return (nhdr);
3537 }
3538
3539 /*
3540  * This function is used by the send / receive code to convert a newly
3541  * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
3542  * is also used to allow the root objset block to be updated without altering
3543  * its embedded MACs. Both block types will always be uncompressed so we do not
3544  * have to worry about compression type or psize.
3545  */
3546 void
3547 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
3548     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
3549     const uint8_t *mac)
3550 {
3551         arc_buf_hdr_t *hdr = buf->b_hdr;
3552
3553         ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
3554         ASSERT(HDR_HAS_L1HDR(hdr));
3555         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3556
3557         buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
3558         if (!HDR_PROTECTED(hdr))
3559                 hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
3560         hdr->b_crypt_hdr.b_dsobj = dsobj;
3561         hdr->b_crypt_hdr.b_ot = ot;
3562         hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3563             DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3564         if (!arc_hdr_has_uncompressed_buf(hdr))
3565                 arc_cksum_free(hdr);
3566
3567         if (salt != NULL)
3568                 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3569         if (iv != NULL)
3570                 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3571         if (mac != NULL)
3572                 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3573 }
3574
3575 /*
3576  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3577  * The buf is returned thawed since we expect the consumer to modify it.
3578  */
3579 arc_buf_t *
3580 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
3581 {
3582         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3583             B_FALSE, ZIO_COMPRESS_OFF, 0, type);
3584
3585         arc_buf_t *buf = NULL;
3586         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
3587             B_FALSE, B_FALSE, &buf));
3588         arc_buf_thaw(buf);
3589
3590         return (buf);
3591 }
3592
3593 /*
3594  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3595  * for bufs containing metadata.
3596  */
3597 arc_buf_t *
3598 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3599     enum zio_compress compression_type, uint8_t complevel)
3600 {
3601         ASSERT3U(lsize, >, 0);
3602         ASSERT3U(lsize, >=, psize);
3603         ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3604         ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3605
3606         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3607             B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
3608
3609         arc_buf_t *buf = NULL;
3610         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
3611             B_TRUE, B_FALSE, B_FALSE, &buf));
3612         arc_buf_thaw(buf);
3613         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3614
3615         /*
3616          * To ensure that the hdr has the correct data in it if we call
3617          * arc_untransform() on this buf before it's been written to disk,
3618          * it's easiest if we just set up sharing between the buf and the hdr.
3619          */
3620         arc_share_buf(hdr, buf);
3621
3622         return (buf);
3623 }
3624
3625 arc_buf_t *
3626 arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
3627     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
3628     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3629     enum zio_compress compression_type, uint8_t complevel)
3630 {
3631         arc_buf_hdr_t *hdr;
3632         arc_buf_t *buf;
3633         arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3634             ARC_BUFC_METADATA : ARC_BUFC_DATA;
3635
3636         ASSERT3U(lsize, >, 0);
3637         ASSERT3U(lsize, >=, psize);
3638         ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3639         ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3640
3641         hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3642             compression_type, complevel, type);
3643
3644         hdr->b_crypt_hdr.b_dsobj = dsobj;
3645         hdr->b_crypt_hdr.b_ot = ot;
3646         hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3647             DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3648         bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3649         bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3650         bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3651
3652         /*
3653          * This buffer will be considered encrypted even if the ot is not an
3654          * encrypted type. It will become authenticated instead in
3655          * arc_write_ready().
3656          */
3657         buf = NULL;
3658         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
3659             B_FALSE, B_FALSE, &buf));
3660         arc_buf_thaw(buf);
3661         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3662
3663         return (buf);
3664 }
3665
3666 static void
3667 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
3668     boolean_t state_only)
3669 {
3670         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3671         l2arc_dev_t *dev = l2hdr->b_dev;
3672         uint64_t lsize = HDR_GET_LSIZE(hdr);
3673         uint64_t psize = HDR_GET_PSIZE(hdr);
3674         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
3675         arc_buf_contents_t type = hdr->b_type;
3676         int64_t lsize_s;
3677         int64_t psize_s;
3678         int64_t asize_s;
3679
3680         if (incr) {
3681                 lsize_s = lsize;
3682                 psize_s = psize;
3683                 asize_s = asize;
3684         } else {
3685                 lsize_s = -lsize;
3686                 psize_s = -psize;
3687                 asize_s = -asize;
3688         }
3689
3690         /* If the buffer is a prefetch, count it as such. */
3691         if (HDR_PREFETCH(hdr)) {
3692                 ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
3693         } else {
3694                 /*
3695                  * We use the value stored in the L2 header upon initial
3696                  * caching in L2ARC. This value will be updated in case
3697                  * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
3698                  * metadata (log entry) cannot currently be updated. Having
3699                  * the ARC state in the L2 header solves the problem of a
3700                  * possibly absent L1 header (apparent in buffers restored
3701                  * from persistent L2ARC).
3702                  */
3703                 switch (hdr->b_l2hdr.b_arcs_state) {
3704                         case ARC_STATE_MRU_GHOST:
3705                         case ARC_STATE_MRU:
3706                                 ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
3707                                 break;
3708                         case ARC_STATE_MFU_GHOST:
3709                         case ARC_STATE_MFU:
3710                                 ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
3711                                 break;
3712                         default:
3713                                 break;
3714                 }
3715         }
3716
3717         if (state_only)
3718                 return;
3719
3720         ARCSTAT_INCR(arcstat_l2_psize, psize_s);
3721         ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
3722
3723         switch (type) {
3724                 case ARC_BUFC_DATA:
3725                         ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
3726                         break;
3727                 case ARC_BUFC_METADATA:
3728                         ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
3729                         break;
3730                 default:
3731                         break;
3732         }
3733 }
3734
3735
3736 static void
3737 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3738 {
3739         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3740         l2arc_dev_t *dev = l2hdr->b_dev;
3741         uint64_t psize = HDR_GET_PSIZE(hdr);
3742         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
3743
3744         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3745         ASSERT(HDR_HAS_L2HDR(hdr));
3746
3747         list_remove(&dev->l2ad_buflist, hdr);
3748
3749         l2arc_hdr_arcstats_decrement(hdr);
3750         vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
3751
3752         (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3753             hdr);
3754         arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3755 }
3756
3757 static void
3758 arc_hdr_destroy(arc_buf_hdr_t *hdr)
3759 {
3760         if (HDR_HAS_L1HDR(hdr)) {
3761                 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
3762                     hdr->b_l1hdr.b_bufcnt > 0);
3763                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3764                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3765         }
3766         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3767         ASSERT(!HDR_IN_HASH_TABLE(hdr));
3768
3769         if (HDR_HAS_L2HDR(hdr)) {
3770                 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3771                 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3772
3773                 if (!buflist_held)
3774                         mutex_enter(&dev->l2ad_mtx);
3775
3776                 /*
3777                  * Even though we checked this conditional above, we
3778                  * need to check this again now that we have the
3779                  * l2ad_mtx. This is because we could be racing with
3780                  * another thread calling l2arc_evict() which might have
3781                  * destroyed this header's L2 portion as we were waiting
3782                  * to acquire the l2ad_mtx. If that happens, we don't
3783                  * want to re-destroy the header's L2 portion.
3784                  */
3785                 if (HDR_HAS_L2HDR(hdr))
3786                         arc_hdr_l2hdr_destroy(hdr);
3787
3788                 if (!buflist_held)
3789                         mutex_exit(&dev->l2ad_mtx);
3790         }
3791
3792         /*
3793          * The header's identify can only be safely discarded once it is no
3794          * longer discoverable.  This requires removing it from the hash table
3795          * and the l2arc header list.  After this point the hash lock can not
3796          * be used to protect the header.
3797          */
3798         if (!HDR_EMPTY(hdr))
3799                 buf_discard_identity(hdr);
3800
3801         if (HDR_HAS_L1HDR(hdr)) {
3802                 arc_cksum_free(hdr);
3803
3804                 while (hdr->b_l1hdr.b_buf != NULL)
3805                         arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
3806
3807                 if (hdr->b_l1hdr.b_pabd != NULL)
3808                         arc_hdr_free_abd(hdr, B_FALSE);
3809
3810                 if (HDR_HAS_RABD(hdr))
3811                         arc_hdr_free_abd(hdr, B_TRUE);
3812         }
3813
3814         ASSERT3P(hdr->b_hash_next, ==, NULL);
3815         if (HDR_HAS_L1HDR(hdr)) {
3816                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3817                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
3818
3819                 if (!HDR_PROTECTED(hdr)) {
3820                         kmem_cache_free(hdr_full_cache, hdr);
3821                 } else {
3822                         kmem_cache_free(hdr_full_crypt_cache, hdr);
3823                 }
3824         } else {
3825                 kmem_cache_free(hdr_l2only_cache, hdr);
3826         }
3827 }
3828
3829 void
3830 arc_buf_destroy(arc_buf_t *buf, void* tag)
3831 {
3832         arc_buf_hdr_t *hdr = buf->b_hdr;
3833
3834         if (hdr->b_l1hdr.b_state == arc_anon) {
3835                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
3836                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3837                 VERIFY0(remove_reference(hdr, NULL, tag));
3838                 arc_hdr_destroy(hdr);
3839                 return;
3840         }
3841
3842         kmutex_t *hash_lock = HDR_LOCK(hdr);
3843         mutex_enter(hash_lock);
3844
3845         ASSERT3P(hdr, ==, buf->b_hdr);
3846         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3847         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3848         ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3849         ASSERT3P(buf->b_data, !=, NULL);
3850
3851         (void) remove_reference(hdr, hash_lock, tag);
3852         arc_buf_destroy_impl(buf);
3853         mutex_exit(hash_lock);
3854 }
3855
3856 /*
3857  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
3858  * state of the header is dependent on its state prior to entering this
3859  * function. The following transitions are possible:
3860  *
3861  *    - arc_mru -> arc_mru_ghost
3862  *    - arc_mfu -> arc_mfu_ghost
3863  *    - arc_mru_ghost -> arc_l2c_only
3864  *    - arc_mru_ghost -> deleted
3865  *    - arc_mfu_ghost -> arc_l2c_only
3866  *    - arc_mfu_ghost -> deleted
3867  *
3868  * Return total size of evicted data buffers for eviction progress tracking.
3869  * When evicting from ghost states return logical buffer size to make eviction
3870  * progress at the same (or at least comparable) rate as from non-ghost states.
3871  *
3872  * Return *real_evicted for actual ARC size reduction to wake up threads
3873  * waiting for it.  For non-ghost states it includes size of evicted data
3874  * buffers (the headers are not freed there).  For ghost states it includes
3875  * only the evicted headers size.
3876  */
3877 static int64_t
3878 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
3879 {
3880         arc_state_t *evicted_state, *state;
3881         int64_t bytes_evicted = 0;
3882         int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
3883             arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
3884
3885         ASSERT(MUTEX_HELD(hash_lock));
3886         ASSERT(HDR_HAS_L1HDR(hdr));
3887
3888         *real_evicted = 0;
3889         state = hdr->b_l1hdr.b_state;
3890         if (GHOST_STATE(state)) {
3891                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3892                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3893
3894                 /*
3895                  * l2arc_write_buffers() relies on a header's L1 portion
3896                  * (i.e. its b_pabd field) during it's write phase.
3897                  * Thus, we cannot push a header onto the arc_l2c_only
3898                  * state (removing its L1 piece) until the header is
3899                  * done being written to the l2arc.
3900                  */
3901                 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3902                         ARCSTAT_BUMP(arcstat_evict_l2_skip);
3903                         return (bytes_evicted);
3904                 }
3905
3906                 ARCSTAT_BUMP(arcstat_deleted);
3907                 bytes_evicted += HDR_GET_LSIZE(hdr);
3908
3909                 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
3910
3911                 if (HDR_HAS_L2HDR(hdr)) {
3912                         ASSERT(hdr->b_l1hdr.b_pabd == NULL);
3913                         ASSERT(!HDR_HAS_RABD(hdr));
3914                         /*
3915                          * This buffer is cached on the 2nd Level ARC;
3916                          * don't destroy the header.
3917                          */
3918                         arc_change_state(arc_l2c_only, hdr, hash_lock);
3919                         /*
3920                          * dropping from L1+L2 cached to L2-only,
3921                          * realloc to remove the L1 header.
3922                          */
3923                         hdr = arc_hdr_realloc(hdr, hdr_full_cache,
3924                             hdr_l2only_cache);
3925                         *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
3926                 } else {
3927                         arc_change_state(arc_anon, hdr, hash_lock);
3928                         arc_hdr_destroy(hdr);
3929                         *real_evicted += HDR_FULL_SIZE;
3930                 }
3931                 return (bytes_evicted);
3932         }
3933
3934         ASSERT(state == arc_mru || state == arc_mfu);
3935         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3936
3937         /* prefetch buffers have a minimum lifespan */
3938         if (HDR_IO_IN_PROGRESS(hdr) ||
3939             ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
3940             ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
3941             MSEC_TO_TICK(min_lifetime))) {
3942                 ARCSTAT_BUMP(arcstat_evict_skip);
3943                 return (bytes_evicted);
3944         }
3945
3946         ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3947         while (hdr->b_l1hdr.b_buf) {
3948                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3949                 if (!mutex_tryenter(&buf->b_evict_lock)) {
3950                         ARCSTAT_BUMP(arcstat_mutex_miss);
3951                         break;
3952                 }
3953                 if (buf->b_data != NULL) {
3954                         bytes_evicted += HDR_GET_LSIZE(hdr);
3955                         *real_evicted += HDR_GET_LSIZE(hdr);
3956                 }
3957                 mutex_exit(&buf->b_evict_lock);
3958                 arc_buf_destroy_impl(buf);
3959         }
3960
3961         if (HDR_HAS_L2HDR(hdr)) {
3962                 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
3963         } else {
3964                 if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3965                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
3966                             HDR_GET_LSIZE(hdr));
3967
3968                         switch (state->arcs_state) {
3969                                 case ARC_STATE_MRU:
3970                                         ARCSTAT_INCR(
3971                                             arcstat_evict_l2_eligible_mru,
3972                                             HDR_GET_LSIZE(hdr));
3973                                         break;
3974                                 case ARC_STATE_MFU:
3975                                         ARCSTAT_INCR(
3976                                             arcstat_evict_l2_eligible_mfu,
3977                                             HDR_GET_LSIZE(hdr));
3978                                         break;
3979                                 default:
3980                                         break;
3981                         }
3982                 } else {
3983                         ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3984                             HDR_GET_LSIZE(hdr));
3985                 }
3986         }
3987
3988         if (hdr->b_l1hdr.b_bufcnt == 0) {
3989                 arc_cksum_free(hdr);
3990
3991                 bytes_evicted += arc_hdr_size(hdr);
3992                 *real_evicted += arc_hdr_size(hdr);
3993
3994                 /*
3995                  * If this hdr is being evicted and has a compressed
3996                  * buffer then we discard it here before we change states.
3997                  * This ensures that the accounting is updated correctly
3998                  * in arc_free_data_impl().
3999                  */
4000                 if (hdr->b_l1hdr.b_pabd != NULL)
4001                         arc_hdr_free_abd(hdr, B_FALSE);
4002
4003                 if (HDR_HAS_RABD(hdr))
4004                         arc_hdr_free_abd(hdr, B_TRUE);
4005
4006                 arc_change_state(evicted_state, hdr, hash_lock);
4007                 ASSERT(HDR_IN_HASH_TABLE(hdr));
4008                 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
4009                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
4010         }
4011
4012         return (bytes_evicted);
4013 }
4014
4015 static void
4016 arc_set_need_free(void)
4017 {
4018         ASSERT(MUTEX_HELD(&arc_evict_lock));
4019         int64_t remaining = arc_free_memory() - arc_sys_free / 2;
4020         arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
4021         if (aw == NULL) {
4022                 arc_need_free = MAX(-remaining, 0);
4023         } else {
4024                 arc_need_free =
4025                     MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
4026         }
4027 }
4028
4029 static uint64_t
4030 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
4031     uint64_t spa, uint64_t bytes)
4032 {
4033         multilist_sublist_t *mls;
4034         uint64_t bytes_evicted = 0, real_evicted = 0;
4035         arc_buf_hdr_t *hdr;
4036         kmutex_t *hash_lock;
4037         int evict_count = zfs_arc_evict_batch_limit;
4038
4039         ASSERT3P(marker, !=, NULL);
4040
4041         mls = multilist_sublist_lock(ml, idx);
4042
4043         for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
4044             hdr = multilist_sublist_prev(mls, marker)) {
4045                 if ((evict_count <= 0) || (bytes_evicted >= bytes))
4046                         break;
4047
4048                 /*
4049                  * To keep our iteration location, move the marker
4050                  * forward. Since we're not holding hdr's hash lock, we
4051                  * must be very careful and not remove 'hdr' from the
4052                  * sublist. Otherwise, other consumers might mistake the
4053                  * 'hdr' as not being on a sublist when they call the
4054                  * multilist_link_active() function (they all rely on
4055                  * the hash lock protecting concurrent insertions and
4056                  * removals). multilist_sublist_move_forward() was
4057                  * specifically implemented to ensure this is the case
4058                  * (only 'marker' will be removed and re-inserted).
4059                  */
4060                 multilist_sublist_move_forward(mls, marker);
4061
4062                 /*
4063                  * The only case where the b_spa field should ever be
4064                  * zero, is the marker headers inserted by
4065                  * arc_evict_state(). It's possible for multiple threads
4066                  * to be calling arc_evict_state() concurrently (e.g.
4067                  * dsl_pool_close() and zio_inject_fault()), so we must
4068                  * skip any markers we see from these other threads.
4069                  */
4070                 if (hdr->b_spa == 0)
4071                         continue;
4072
4073                 /* we're only interested in evicting buffers of a certain spa */
4074                 if (spa != 0 && hdr->b_spa != spa) {
4075                         ARCSTAT_BUMP(arcstat_evict_skip);
4076                         continue;
4077                 }
4078
4079                 hash_lock = HDR_LOCK(hdr);
4080
4081                 /*
4082                  * We aren't calling this function from any code path
4083                  * that would already be holding a hash lock, so we're
4084                  * asserting on this assumption to be defensive in case
4085                  * this ever changes. Without this check, it would be
4086                  * possible to incorrectly increment arcstat_mutex_miss
4087                  * below (e.g. if the code changed such that we called
4088                  * this function with a hash lock held).
4089                  */
4090                 ASSERT(!MUTEX_HELD(hash_lock));
4091
4092                 if (mutex_tryenter(hash_lock)) {
4093                         uint64_t revicted;
4094                         uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
4095                             &revicted);
4096                         mutex_exit(hash_lock);
4097
4098                         bytes_evicted += evicted;
4099                         real_evicted += revicted;
4100
4101                         /*
4102                          * If evicted is zero, arc_evict_hdr() must have
4103                          * decided to skip this header, don't increment
4104                          * evict_count in this case.
4105                          */
4106                         if (evicted != 0)
4107                                 evict_count--;
4108
4109                 } else {
4110                         ARCSTAT_BUMP(arcstat_mutex_miss);
4111                 }
4112         }
4113
4114         multilist_sublist_unlock(mls);
4115
4116         /*
4117          * Increment the count of evicted bytes, and wake up any threads that
4118          * are waiting for the count to reach this value.  Since the list is
4119          * ordered by ascending aew_count, we pop off the beginning of the
4120          * list until we reach the end, or a waiter that's past the current
4121          * "count".  Doing this outside the loop reduces the number of times
4122          * we need to acquire the global arc_evict_lock.
4123          *
4124          * Only wake when there's sufficient free memory in the system
4125          * (specifically, arc_sys_free/2, which by default is a bit more than
4126          * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
4127          */
4128         mutex_enter(&arc_evict_lock);
4129         arc_evict_count += real_evicted;
4130
4131         if (arc_free_memory() > arc_sys_free / 2) {
4132                 arc_evict_waiter_t *aw;
4133                 while ((aw = list_head(&arc_evict_waiters)) != NULL &&
4134                     aw->aew_count <= arc_evict_count) {
4135                         list_remove(&arc_evict_waiters, aw);
4136                         cv_broadcast(&aw->aew_cv);
4137                 }
4138         }
4139         arc_set_need_free();
4140         mutex_exit(&arc_evict_lock);
4141
4142         /*
4143          * If the ARC size is reduced from arc_c_max to arc_c_min (especially
4144          * if the average cached block is small), eviction can be on-CPU for
4145          * many seconds.  To ensure that other threads that may be bound to
4146          * this CPU are able to make progress, make a voluntary preemption
4147          * call here.
4148          */
4149         cond_resched();
4150
4151         return (bytes_evicted);
4152 }
4153
4154 /*
4155  * Allocate an array of buffer headers used as placeholders during arc state
4156  * eviction.
4157  */
4158 static arc_buf_hdr_t **
4159 arc_state_alloc_markers(int count)
4160 {
4161         arc_buf_hdr_t **markers;
4162
4163         markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
4164         for (int i = 0; i < count; i++) {
4165                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4166
4167                 /*
4168                  * A b_spa of 0 is used to indicate that this header is
4169                  * a marker. This fact is used in arc_evict_type() and
4170                  * arc_evict_state_impl().
4171                  */
4172                 markers[i]->b_spa = 0;
4173
4174         }
4175         return (markers);
4176 }
4177
4178 static void
4179 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
4180 {
4181         for (int i = 0; i < count; i++)
4182                 kmem_cache_free(hdr_full_cache, markers[i]);
4183         kmem_free(markers, sizeof (*markers) * count);
4184 }
4185
4186 /*
4187  * Evict buffers from the given arc state, until we've removed the
4188  * specified number of bytes. Move the removed buffers to the
4189  * appropriate evict state.
4190  *
4191  * This function makes a "best effort". It skips over any buffers
4192  * it can't get a hash_lock on, and so, may not catch all candidates.
4193  * It may also return without evicting as much space as requested.
4194  *
4195  * If bytes is specified using the special value ARC_EVICT_ALL, this
4196  * will evict all available (i.e. unlocked and evictable) buffers from
4197  * the given arc state; which is used by arc_flush().
4198  */
4199 static uint64_t
4200 arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
4201     arc_buf_contents_t type)
4202 {
4203         uint64_t total_evicted = 0;
4204         multilist_t *ml = &state->arcs_list[type];
4205         int num_sublists;
4206         arc_buf_hdr_t **markers;
4207
4208         num_sublists = multilist_get_num_sublists(ml);
4209
4210         /*
4211          * If we've tried to evict from each sublist, made some
4212          * progress, but still have not hit the target number of bytes
4213          * to evict, we want to keep trying. The markers allow us to
4214          * pick up where we left off for each individual sublist, rather
4215          * than starting from the tail each time.
4216          */
4217         if (zthr_iscurthread(arc_evict_zthr)) {
4218                 markers = arc_state_evict_markers;
4219                 ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
4220         } else {
4221                 markers = arc_state_alloc_markers(num_sublists);
4222         }
4223         for (int i = 0; i < num_sublists; i++) {
4224                 multilist_sublist_t *mls;
4225
4226                 mls = multilist_sublist_lock(ml, i);
4227                 multilist_sublist_insert_tail(mls, markers[i]);
4228                 multilist_sublist_unlock(mls);
4229         }
4230
4231         /*
4232          * While we haven't hit our target number of bytes to evict, or
4233          * we're evicting all available buffers.
4234          */
4235         while (total_evicted < bytes) {
4236                 int sublist_idx = multilist_get_random_index(ml);
4237                 uint64_t scan_evicted = 0;
4238
4239                 /*
4240                  * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
4241                  * Request that 10% of the LRUs be scanned by the superblock
4242                  * shrinker.
4243                  */
4244                 if (type == ARC_BUFC_DATA && aggsum_compare(
4245                     &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) {
4246                         arc_prune_async((aggsum_upper_bound(
4247                             &arc_sums.arcstat_dnode_size) -
4248                             arc_dnode_size_limit) / sizeof (dnode_t) /
4249                             zfs_arc_dnode_reduce_percent);
4250                 }
4251
4252                 /*
4253                  * Start eviction using a randomly selected sublist,
4254                  * this is to try and evenly balance eviction across all
4255                  * sublists. Always starting at the same sublist
4256                  * (e.g. index 0) would cause evictions to favor certain
4257                  * sublists over others.
4258                  */
4259                 for (int i = 0; i < num_sublists; i++) {
4260                         uint64_t bytes_remaining;
4261                         uint64_t bytes_evicted;
4262
4263                         if (total_evicted < bytes)
4264                                 bytes_remaining = bytes - total_evicted;
4265                         else
4266                                 break;
4267
4268                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
4269                             markers[sublist_idx], spa, bytes_remaining);
4270
4271                         scan_evicted += bytes_evicted;
4272                         total_evicted += bytes_evicted;
4273
4274                         /* we've reached the end, wrap to the beginning */
4275                         if (++sublist_idx >= num_sublists)
4276                                 sublist_idx = 0;
4277                 }
4278
4279                 /*
4280                  * If we didn't evict anything during this scan, we have
4281                  * no reason to believe we'll evict more during another
4282                  * scan, so break the loop.
4283                  */
4284                 if (scan_evicted == 0) {
4285                         /* This isn't possible, let's make that obvious */
4286                         ASSERT3S(bytes, !=, 0);
4287
4288                         /*
4289                          * When bytes is ARC_EVICT_ALL, the only way to
4290                          * break the loop is when scan_evicted is zero.
4291                          * In that case, we actually have evicted enough,
4292                          * so we don't want to increment the kstat.
4293                          */
4294                         if (bytes != ARC_EVICT_ALL) {
4295                                 ASSERT3S(total_evicted, <, bytes);
4296                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
4297                         }
4298
4299                         break;
4300                 }
4301         }
4302
4303         for (int i = 0; i < num_sublists; i++) {
4304                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4305                 multilist_sublist_remove(mls, markers[i]);
4306                 multilist_sublist_unlock(mls);
4307         }
4308         if (markers != arc_state_evict_markers)
4309                 arc_state_free_markers(markers, num_sublists);
4310
4311         return (total_evicted);
4312 }
4313
4314 /*
4315  * Flush all "evictable" data of the given type from the arc state
4316  * specified. This will not evict any "active" buffers (i.e. referenced).
4317  *
4318  * When 'retry' is set to B_FALSE, the function will make a single pass
4319  * over the state and evict any buffers that it can. Since it doesn't
4320  * continually retry the eviction, it might end up leaving some buffers
4321  * in the ARC due to lock misses.
4322  *
4323  * When 'retry' is set to B_TRUE, the function will continually retry the
4324  * eviction until *all* evictable buffers have been removed from the
4325  * state. As a result, if concurrent insertions into the state are
4326  * allowed (e.g. if the ARC isn't shutting down), this function might
4327  * wind up in an infinite loop, continually trying to evict buffers.
4328  */
4329 static uint64_t
4330 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4331     boolean_t retry)
4332 {
4333         uint64_t evicted = 0;
4334
4335         while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
4336                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
4337
4338                 if (!retry)
4339                         break;
4340         }
4341
4342         return (evicted);
4343 }
4344
4345 /*
4346  * Evict the specified number of bytes from the state specified,
4347  * restricting eviction to the spa and type given. This function
4348  * prevents us from trying to evict more from a state's list than
4349  * is "evictable", and to skip evicting altogether when passed a
4350  * negative value for "bytes". In contrast, arc_evict_state() will
4351  * evict everything it can, when passed a negative value for "bytes".
4352  */
4353 static uint64_t
4354 arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
4355     arc_buf_contents_t type)
4356 {
4357         uint64_t delta;
4358
4359         if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
4360                 delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
4361                     bytes);
4362                 return (arc_evict_state(state, spa, delta, type));
4363         }
4364
4365         return (0);
4366 }
4367
4368 /*
4369  * The goal of this function is to evict enough meta data buffers from the
4370  * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
4371  * more complicated than it appears because it is common for data buffers
4372  * to have holds on meta data buffers.  In addition, dnode meta data buffers
4373  * will be held by the dnodes in the block preventing them from being freed.
4374  * This means we can't simply traverse the ARC and expect to always find
4375  * enough unheld meta data buffer to release.
4376  *
4377  * Therefore, this function has been updated to make alternating passes
4378  * over the ARC releasing data buffers and then newly unheld meta data
4379  * buffers.  This ensures forward progress is maintained and meta_used
4380  * will decrease.  Normally this is sufficient, but if required the ARC
4381  * will call the registered prune callbacks causing dentry and inodes to
4382  * be dropped from the VFS cache.  This will make dnode meta data buffers
4383  * available for reclaim.
4384  */
4385 static uint64_t
4386 arc_evict_meta_balanced(uint64_t meta_used)
4387 {
4388         int64_t delta, prune = 0, adjustmnt;
4389         uint64_t total_evicted = 0;
4390         arc_buf_contents_t type = ARC_BUFC_DATA;
4391         int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
4392
4393 restart:
4394         /*
4395          * This slightly differs than the way we evict from the mru in
4396          * arc_evict because we don't have a "target" value (i.e. no
4397          * "meta" arc_p). As a result, I think we can completely
4398          * cannibalize the metadata in the MRU before we evict the
4399          * metadata from the MFU. I think we probably need to implement a
4400          * "metadata arc_p" value to do this properly.
4401          */
4402         adjustmnt = meta_used - arc_meta_limit;
4403
4404         if (adjustmnt > 0 &&
4405             zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
4406                 delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
4407                     adjustmnt);
4408                 total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
4409                 adjustmnt -= delta;
4410         }
4411
4412         /*
4413          * We can't afford to recalculate adjustmnt here. If we do,
4414          * new metadata buffers can sneak into the MRU or ANON lists,
4415          * thus penalize the MFU metadata. Although the fudge factor is
4416          * small, it has been empirically shown to be significant for
4417          * certain workloads (e.g. creating many empty directories). As
4418          * such, we use the original calculation for adjustmnt, and
4419          * simply decrement the amount of data evicted from the MRU.
4420          */
4421
4422         if (adjustmnt > 0 &&
4423             zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
4424                 delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
4425                     adjustmnt);
4426                 total_evicted += arc_evict_impl(arc_mfu, 0, delta, type);
4427         }
4428
4429         adjustmnt = meta_used - arc_meta_limit;
4430
4431         if (adjustmnt > 0 &&
4432             zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
4433                 delta = MIN(adjustmnt,
4434                     zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
4435                 total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type);
4436                 adjustmnt -= delta;
4437         }
4438
4439         if (adjustmnt > 0 &&
4440             zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
4441                 delta = MIN(adjustmnt,
4442                     zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
4443                 total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type);
4444         }
4445
4446         /*
4447          * If after attempting to make the requested adjustment to the ARC
4448          * the meta limit is still being exceeded then request that the
4449          * higher layers drop some cached objects which have holds on ARC
4450          * meta buffers.  Requests to the upper layers will be made with
4451          * increasingly large scan sizes until the ARC is below the limit.
4452          */
4453         if (meta_used > arc_meta_limit) {
4454                 if (type == ARC_BUFC_DATA) {
4455                         type = ARC_BUFC_METADATA;
4456                 } else {
4457                         type = ARC_BUFC_DATA;
4458
4459                         if (zfs_arc_meta_prune) {
4460                                 prune += zfs_arc_meta_prune;
4461                                 arc_prune_async(prune);
4462                         }
4463                 }
4464
4465                 if (restarts > 0) {
4466                         restarts--;
4467                         goto restart;
4468                 }
4469         }
4470         return (total_evicted);
4471 }
4472
4473 /*
4474  * Evict metadata buffers from the cache, such that arcstat_meta_used is
4475  * capped by the arc_meta_limit tunable.
4476  */
4477 static uint64_t
4478 arc_evict_meta_only(uint64_t meta_used)
4479 {
4480         uint64_t total_evicted = 0;
4481         int64_t target;
4482
4483         /*
4484          * If we're over the meta limit, we want to evict enough
4485          * metadata to get back under the meta limit. We don't want to
4486          * evict so much that we drop the MRU below arc_p, though. If
4487          * we're over the meta limit more than we're over arc_p, we
4488          * evict some from the MRU here, and some from the MFU below.
4489          */
4490         target = MIN((int64_t)(meta_used - arc_meta_limit),
4491             (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4492             zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
4493
4494         total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4495
4496         /*
4497          * Similar to the above, we want to evict enough bytes to get us
4498          * below the meta limit, but not so much as to drop us below the
4499          * space allotted to the MFU (which is defined as arc_c - arc_p).
4500          */
4501         target = MIN((int64_t)(meta_used - arc_meta_limit),
4502             (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
4503             (arc_c - arc_p)));
4504
4505         total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4506
4507         return (total_evicted);
4508 }
4509
4510 static uint64_t
4511 arc_evict_meta(uint64_t meta_used)
4512 {
4513         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
4514                 return (arc_evict_meta_only(meta_used));
4515         else
4516                 return (arc_evict_meta_balanced(meta_used));
4517 }
4518
4519 /*
4520  * Return the type of the oldest buffer in the given arc state
4521  *
4522  * This function will select a random sublist of type ARC_BUFC_DATA and
4523  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
4524  * is compared, and the type which contains the "older" buffer will be
4525  * returned.
4526  */
4527 static arc_buf_contents_t
4528 arc_evict_type(arc_state_t *state)
4529 {
4530         multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
4531         multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
4532         int data_idx = multilist_get_random_index(data_ml);
4533         int meta_idx = multilist_get_random_index(meta_ml);
4534         multilist_sublist_t *data_mls;
4535         multilist_sublist_t *meta_mls;
4536         arc_buf_contents_t type;
4537         arc_buf_hdr_t *data_hdr;
4538         arc_buf_hdr_t *meta_hdr;
4539
4540         /*
4541          * We keep the sublist lock until we're finished, to prevent
4542          * the headers from being destroyed via arc_evict_state().
4543          */
4544         data_mls = multilist_sublist_lock(data_ml, data_idx);
4545         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4546
4547         /*
4548          * These two loops are to ensure we skip any markers that
4549          * might be at the tail of the lists due to arc_evict_state().
4550          */
4551
4552         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4553             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4554                 if (data_hdr->b_spa != 0)
4555                         break;
4556         }
4557
4558         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4559             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4560                 if (meta_hdr->b_spa != 0)
4561                         break;
4562         }
4563
4564         if (data_hdr == NULL && meta_hdr == NULL) {
4565                 type = ARC_BUFC_DATA;
4566         } else if (data_hdr == NULL) {
4567                 ASSERT3P(meta_hdr, !=, NULL);
4568                 type = ARC_BUFC_METADATA;
4569         } else if (meta_hdr == NULL) {
4570                 ASSERT3P(data_hdr, !=, NULL);
4571                 type = ARC_BUFC_DATA;
4572         } else {
4573                 ASSERT3P(data_hdr, !=, NULL);
4574                 ASSERT3P(meta_hdr, !=, NULL);
4575
4576                 /* The headers can't be on the sublist without an L1 header */
4577                 ASSERT(HDR_HAS_L1HDR(data_hdr));
4578                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
4579
4580                 if (data_hdr->b_l1hdr.b_arc_access <
4581                     meta_hdr->b_l1hdr.b_arc_access) {
4582                         type = ARC_BUFC_DATA;
4583                 } else {
4584                         type = ARC_BUFC_METADATA;
4585                 }
4586         }
4587
4588         multilist_sublist_unlock(meta_mls);
4589         multilist_sublist_unlock(data_mls);
4590
4591         return (type);
4592 }
4593
4594 /*
4595  * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
4596  */
4597 static uint64_t
4598 arc_evict(void)
4599 {
4600         uint64_t total_evicted = 0;
4601         uint64_t bytes;
4602         int64_t target;
4603         uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
4604         uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used);
4605
4606         /*
4607          * If we're over arc_meta_limit, we want to correct that before
4608          * potentially evicting data buffers below.
4609          */
4610         total_evicted += arc_evict_meta(ameta);
4611
4612         /*
4613          * Adjust MRU size
4614          *
4615          * If we're over the target cache size, we want to evict enough
4616          * from the list to get back to our target size. We don't want
4617          * to evict too much from the MRU, such that it drops below
4618          * arc_p. So, if we're over our target cache size more than
4619          * the MRU is over arc_p, we'll evict enough to get back to
4620          * arc_p here, and then evict more from the MFU below.
4621          */
4622         target = MIN((int64_t)(asize - arc_c),
4623             (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4624             zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
4625
4626         /*
4627          * If we're below arc_meta_min, always prefer to evict data.
4628          * Otherwise, try to satisfy the requested number of bytes to
4629          * evict from the type which contains older buffers; in an
4630          * effort to keep newer buffers in the cache regardless of their
4631          * type. If we cannot satisfy the number of bytes from this
4632          * type, spill over into the next type.
4633          */
4634         if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
4635             ameta > arc_meta_min) {
4636                 bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4637                 total_evicted += bytes;
4638
4639                 /*
4640                  * If we couldn't evict our target number of bytes from
4641                  * metadata, we try to get the rest from data.
4642                  */
4643                 target -= bytes;
4644
4645                 total_evicted +=
4646                     arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4647         } else {
4648                 bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4649                 total_evicted += bytes;
4650
4651                 /*
4652                  * If we couldn't evict our target number of bytes from
4653                  * data, we try to get the rest from metadata.
4654                  */
4655                 target -= bytes;
4656
4657                 total_evicted +=
4658                     arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4659         }
4660
4661         /*
4662          * Re-sum ARC stats after the first round of evictions.
4663          */
4664         asize = aggsum_value(&arc_sums.arcstat_size);
4665         ameta = aggsum_value(&arc_sums.arcstat_meta_used);
4666
4667
4668         /*
4669          * Adjust MFU size
4670          *
4671          * Now that we've tried to evict enough from the MRU to get its
4672          * size back to arc_p, if we're still above the target cache
4673          * size, we evict the rest from the MFU.
4674          */
4675         target = asize - arc_c;
4676
4677         if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
4678             ameta > arc_meta_min) {
4679                 bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4680                 total_evicted += bytes;
4681
4682                 /*
4683                  * If we couldn't evict our target number of bytes from
4684                  * metadata, we try to get the rest from data.
4685                  */
4686                 target -= bytes;
4687
4688                 total_evicted +=
4689                     arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4690         } else {
4691                 bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4692                 total_evicted += bytes;
4693
4694                 /*
4695                  * If we couldn't evict our target number of bytes from
4696                  * data, we try to get the rest from data.
4697                  */
4698                 target -= bytes;
4699
4700                 total_evicted +=
4701                     arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4702         }
4703
4704         /*
4705          * Adjust ghost lists
4706          *
4707          * In addition to the above, the ARC also defines target values
4708          * for the ghost lists. The sum of the mru list and mru ghost
4709          * list should never exceed the target size of the cache, and
4710          * the sum of the mru list, mfu list, mru ghost list, and mfu
4711          * ghost list should never exceed twice the target size of the
4712          * cache. The following logic enforces these limits on the ghost
4713          * caches, and evicts from them as needed.
4714          */
4715         target = zfs_refcount_count(&arc_mru->arcs_size) +
4716             zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
4717
4718         bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4719         total_evicted += bytes;
4720
4721         target -= bytes;
4722
4723         total_evicted +=
4724             arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4725
4726         /*
4727          * We assume the sum of the mru list and mfu list is less than
4728          * or equal to arc_c (we enforced this above), which means we
4729          * can use the simpler of the two equations below:
4730          *
4731          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4732          *                  mru ghost + mfu ghost <= arc_c
4733          */
4734         target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
4735             zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
4736
4737         bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4738         total_evicted += bytes;
4739
4740         target -= bytes;
4741
4742         total_evicted +=
4743             arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4744
4745         return (total_evicted);
4746 }
4747
4748 void
4749 arc_flush(spa_t *spa, boolean_t retry)
4750 {
4751         uint64_t guid = 0;
4752
4753         /*
4754          * If retry is B_TRUE, a spa must not be specified since we have
4755          * no good way to determine if all of a spa's buffers have been
4756          * evicted from an arc state.
4757          */
4758         ASSERT(!retry || spa == 0);
4759
4760         if (spa != NULL)
4761                 guid = spa_load_guid(spa);
4762
4763         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4764         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4765
4766         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4767         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4768
4769         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4770         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4771
4772         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4773         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4774 }
4775
4776 void
4777 arc_reduce_target_size(int64_t to_free)
4778 {
4779         uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
4780
4781         /*
4782          * All callers want the ARC to actually evict (at least) this much
4783          * memory.  Therefore we reduce from the lower of the current size and
4784          * the target size.  This way, even if arc_c is much higher than
4785          * arc_size (as can be the case after many calls to arc_freed(), we will
4786          * immediately have arc_c < arc_size and therefore the arc_evict_zthr
4787          * will evict.
4788          */
4789         uint64_t c = MIN(arc_c, asize);
4790
4791         if (c > to_free && c - to_free > arc_c_min) {
4792                 arc_c = c - to_free;
4793                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
4794                 if (arc_p > arc_c)
4795                         arc_p = (arc_c >> 1);
4796                 ASSERT(arc_c >= arc_c_min);
4797                 ASSERT((int64_t)arc_p >= 0);
4798         } else {
4799                 arc_c = arc_c_min;
4800         }
4801
4802         if (asize > arc_c) {
4803                 /* See comment in arc_evict_cb_check() on why lock+flag */
4804                 mutex_enter(&arc_evict_lock);
4805                 arc_evict_needed = B_TRUE;
4806                 mutex_exit(&arc_evict_lock);
4807                 zthr_wakeup(arc_evict_zthr);
4808         }
4809 }
4810
4811 /*
4812  * Determine if the system is under memory pressure and is asking
4813  * to reclaim memory. A return value of B_TRUE indicates that the system
4814  * is under memory pressure and that the arc should adjust accordingly.
4815  */
4816 boolean_t
4817 arc_reclaim_needed(void)
4818 {
4819         return (arc_available_memory() < 0);
4820 }
4821
4822 void
4823 arc_kmem_reap_soon(void)
4824 {
4825         size_t                  i;
4826         kmem_cache_t            *prev_cache = NULL;
4827         kmem_cache_t            *prev_data_cache = NULL;
4828         extern kmem_cache_t     *zio_buf_cache[];
4829         extern kmem_cache_t     *zio_data_buf_cache[];
4830
4831 #ifdef _KERNEL
4832         if ((aggsum_compare(&arc_sums.arcstat_meta_used,
4833             arc_meta_limit) >= 0) && zfs_arc_meta_prune) {
4834                 /*
4835                  * We are exceeding our meta-data cache limit.
4836                  * Prune some entries to release holds on meta-data.
4837                  */
4838                 arc_prune_async(zfs_arc_meta_prune);
4839         }
4840 #if defined(_ILP32)
4841         /*
4842          * Reclaim unused memory from all kmem caches.
4843          */
4844         kmem_reap();
4845 #endif
4846 #endif
4847
4848         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4849 #if defined(_ILP32)
4850                 /* reach upper limit of cache size on 32-bit */
4851                 if (zio_buf_cache[i] == NULL)
4852                         break;
4853 #endif
4854                 if (zio_buf_cache[i] != prev_cache) {
4855                         prev_cache = zio_buf_cache[i];
4856                         kmem_cache_reap_now(zio_buf_cache[i]);
4857                 }
4858                 if (zio_data_buf_cache[i] != prev_data_cache) {
4859                         prev_data_cache = zio_data_buf_cache[i];
4860                         kmem_cache_reap_now(zio_data_buf_cache[i]);
4861                 }
4862         }
4863         kmem_cache_reap_now(buf_cache);
4864         kmem_cache_reap_now(hdr_full_cache);
4865         kmem_cache_reap_now(hdr_l2only_cache);
4866         kmem_cache_reap_now(zfs_btree_leaf_cache);
4867         abd_cache_reap_now();
4868 }
4869
4870 /* ARGSUSED */
4871 static boolean_t
4872 arc_evict_cb_check(void *arg, zthr_t *zthr)
4873 {
4874 #ifdef ZFS_DEBUG
4875         /*
4876          * This is necessary in order to keep the kstat information
4877          * up to date for tools that display kstat data such as the
4878          * mdb ::arc dcmd and the Linux crash utility.  These tools
4879          * typically do not call kstat's update function, but simply
4880          * dump out stats from the most recent update.  Without
4881          * this call, these commands may show stale stats for the
4882          * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
4883          * with this call, the data might be out of date if the
4884          * evict thread hasn't been woken recently; but that should
4885          * suffice.  The arc_state_t structures can be queried
4886          * directly if more accurate information is needed.
4887          */
4888         if (arc_ksp != NULL)
4889                 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4890 #endif
4891
4892         /*
4893          * We have to rely on arc_wait_for_eviction() to tell us when to
4894          * evict, rather than checking if we are overflowing here, so that we
4895          * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
4896          * If we have become "not overflowing" since arc_wait_for_eviction()
4897          * checked, we need to wake it up.  We could broadcast the CV here,
4898          * but arc_wait_for_eviction() may have not yet gone to sleep.  We
4899          * would need to use a mutex to ensure that this function doesn't
4900          * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
4901          * the arc_evict_lock).  However, the lock ordering of such a lock
4902          * would necessarily be incorrect with respect to the zthr_lock,
4903          * which is held before this function is called, and is held by
4904          * arc_wait_for_eviction() when it calls zthr_wakeup().
4905          */
4906         return (arc_evict_needed);
4907 }
4908
4909 /*
4910  * Keep arc_size under arc_c by running arc_evict which evicts data
4911  * from the ARC.
4912  */
4913 /* ARGSUSED */
4914 static void
4915 arc_evict_cb(void *arg, zthr_t *zthr)
4916 {
4917         uint64_t evicted = 0;
4918         fstrans_cookie_t cookie = spl_fstrans_mark();
4919
4920         /* Evict from cache */
4921         evicted = arc_evict();
4922
4923         /*
4924          * If evicted is zero, we couldn't evict anything
4925          * via arc_evict(). This could be due to hash lock
4926          * collisions, but more likely due to the majority of
4927          * arc buffers being unevictable. Therefore, even if
4928          * arc_size is above arc_c, another pass is unlikely to
4929          * be helpful and could potentially cause us to enter an
4930          * infinite loop.  Additionally, zthr_iscancelled() is
4931          * checked here so that if the arc is shutting down, the
4932          * broadcast will wake any remaining arc evict waiters.
4933          */
4934         mutex_enter(&arc_evict_lock);
4935         arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
4936             evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
4937         if (!arc_evict_needed) {
4938                 /*
4939                  * We're either no longer overflowing, or we
4940                  * can't evict anything more, so we should wake
4941                  * arc_get_data_impl() sooner.
4942                  */
4943                 arc_evict_waiter_t *aw;
4944                 while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
4945                         cv_broadcast(&aw->aew_cv);
4946                 }
4947                 arc_set_need_free();
4948         }
4949         mutex_exit(&arc_evict_lock);
4950         spl_fstrans_unmark(cookie);
4951 }
4952
4953 /* ARGSUSED */
4954 static boolean_t
4955 arc_reap_cb_check(void *arg, zthr_t *zthr)
4956 {
4957         int64_t free_memory = arc_available_memory();
4958         static int reap_cb_check_counter = 0;
4959
4960         /*
4961          * If a kmem reap is already active, don't schedule more.  We must
4962          * check for this because kmem_cache_reap_soon() won't actually
4963          * block on the cache being reaped (this is to prevent callers from
4964          * becoming implicitly blocked by a system-wide kmem reap -- which,
4965          * on a system with many, many full magazines, can take minutes).
4966          */
4967         if (!kmem_cache_reap_active() && free_memory < 0) {
4968
4969                 arc_no_grow = B_TRUE;
4970                 arc_warm = B_TRUE;
4971                 /*
4972                  * Wait at least zfs_grow_retry (default 5) seconds
4973                  * before considering growing.
4974                  */
4975                 arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
4976                 return (B_TRUE);
4977         } else if (free_memory < arc_c >> arc_no_grow_shift) {
4978                 arc_no_grow = B_TRUE;
4979         } else if (gethrtime() >= arc_growtime) {
4980                 arc_no_grow = B_FALSE;
4981         }
4982
4983         /*
4984          * Called unconditionally every 60 seconds to reclaim unused
4985          * zstd compression and decompression context. This is done
4986          * here to avoid the need for an independent thread.
4987          */
4988         if (!((reap_cb_check_counter++) % 60))
4989                 zfs_zstd_cache_reap_now();
4990
4991         return (B_FALSE);
4992 }
4993
4994 /*
4995  * Keep enough free memory in the system by reaping the ARC's kmem
4996  * caches.  To cause more slabs to be reapable, we may reduce the
4997  * target size of the cache (arc_c), causing the arc_evict_cb()
4998  * to free more buffers.
4999  */
5000 /* ARGSUSED */
5001 static void
5002 arc_reap_cb(void *arg, zthr_t *zthr)
5003 {
5004         int64_t free_memory;
5005         fstrans_cookie_t cookie = spl_fstrans_mark();
5006
5007         /*
5008          * Kick off asynchronous kmem_reap()'s of all our caches.
5009          */
5010         arc_kmem_reap_soon();
5011
5012         /*
5013          * Wait at least arc_kmem_cache_reap_retry_ms between
5014          * arc_kmem_reap_soon() calls. Without this check it is possible to
5015          * end up in a situation where we spend lots of time reaping
5016          * caches, while we're near arc_c_min.  Waiting here also gives the
5017          * subsequent free memory check a chance of finding that the
5018          * asynchronous reap has already freed enough memory, and we don't
5019          * need to call arc_reduce_target_size().
5020          */
5021         delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
5022
5023         /*
5024          * Reduce the target size as needed to maintain the amount of free
5025          * memory in the system at a fraction of the arc_size (1/128th by
5026          * default).  If oversubscribed (free_memory < 0) then reduce the
5027          * target arc_size by the deficit amount plus the fractional
5028          * amount.  If free memory is positive but less than the fractional
5029          * amount, reduce by what is needed to hit the fractional amount.
5030          */
5031         free_memory = arc_available_memory();
5032
5033         int64_t to_free =
5034             (arc_c >> arc_shrink_shift) - free_memory;
5035         if (to_free > 0) {
5036                 arc_reduce_target_size(to_free);
5037         }
5038         spl_fstrans_unmark(cookie);
5039 }
5040
5041 #ifdef _KERNEL
5042 /*
5043  * Determine the amount of memory eligible for eviction contained in the
5044  * ARC. All clean data reported by the ghost lists can always be safely
5045  * evicted. Due to arc_c_min, the same does not hold for all clean data
5046  * contained by the regular mru and mfu lists.
5047  *
5048  * In the case of the regular mru and mfu lists, we need to report as
5049  * much clean data as possible, such that evicting that same reported
5050  * data will not bring arc_size below arc_c_min. Thus, in certain
5051  * circumstances, the total amount of clean data in the mru and mfu
5052  * lists might not actually be evictable.
5053  *
5054  * The following two distinct cases are accounted for:
5055  *
5056  * 1. The sum of the amount of dirty data contained by both the mru and
5057  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
5058  *    is greater than or equal to arc_c_min.
5059  *    (i.e. amount of dirty data >= arc_c_min)
5060  *
5061  *    This is the easy case; all clean data contained by the mru and mfu
5062  *    lists is evictable. Evicting all clean data can only drop arc_size
5063  *    to the amount of dirty data, which is greater than arc_c_min.
5064  *
5065  * 2. The sum of the amount of dirty data contained by both the mru and
5066  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
5067  *    is less than arc_c_min.
5068  *    (i.e. arc_c_min > amount of dirty data)
5069  *
 *    2.1. arc_size is greater than or equal to arc_c_min.
5071  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
5072  *
5073  *         In this case, not all clean data from the regular mru and mfu
5074  *         lists is actually evictable; we must leave enough clean data
5075  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
5076  *         evictable data from the two lists combined, is exactly the
5077  *         difference between arc_size and arc_c_min.
5078  *
5079  *    2.2. arc_size is less than arc_c_min
5080  *         (i.e. arc_c_min > arc_size > amount of dirty data)
5081  *
5082  *         In this case, none of the data contained in the mru and mfu
5083  *         lists is evictable, even if it's clean. Since arc_size is
5084  *         already below arc_c_min, evicting any more would only
5085  *         increase this negative difference.
5086  */
5087
5088 #endif /* _KERNEL */
5089
5090 /*
5091  * Adapt arc info given the number of bytes we are trying to add and
5092  * the state that we are coming from.  This function is only called
5093  * when we are adding new content to the cache.
5094  */
5095 static void
5096 arc_adapt(int bytes, arc_state_t *state)
5097 {
5098         int mult;
5099         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
5100         int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
5101         int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
5102
5103         ASSERT(bytes > 0);
5104         /*
5105          * Adapt the target size of the MRU list:
5106          *      - if we just hit in the MRU ghost list, then increase
5107          *        the target size of the MRU list.
5108          *      - if we just hit in the MFU ghost list, then increase
5109          *        the target size of the MFU list by decreasing the
5110          *        target size of the MRU list.
5111          */
5112         if (state == arc_mru_ghost) {
5113                 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
5114                 if (!zfs_arc_p_dampener_disable)
5115                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
5116
5117                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
5118         } else if (state == arc_mfu_ghost) {
5119                 uint64_t delta;
5120
5121                 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
5122                 if (!zfs_arc_p_dampener_disable)
5123                         mult = MIN(mult, 10);
5124
5125                 delta = MIN(bytes * mult, arc_p);
5126                 arc_p = MAX(arc_p_min, arc_p - delta);
5127         }
5128         ASSERT((int64_t)arc_p >= 0);
5129
5130         /*
5131          * Wake reap thread if we do not have any available memory
5132          */
5133         if (arc_reclaim_needed()) {
5134                 zthr_wakeup(arc_reap_zthr);
5135                 return;
5136         }
5137
5138         if (arc_no_grow)
5139                 return;
5140
5141         if (arc_c >= arc_c_max)
5142                 return;
5143
5144         /*
5145          * If we're within (2 * maxblocksize) bytes of the target
5146          * cache size, increment the target cache size
5147          */
5148         ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
5149         if (aggsum_upper_bound(&arc_sums.arcstat_size) >=
5150             arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
5151                 atomic_add_64(&arc_c, (int64_t)bytes);
5152                 if (arc_c > arc_c_max)
5153                         arc_c = arc_c_max;
5154                 else if (state == arc_anon)
5155                         atomic_add_64(&arc_p, (int64_t)bytes);
5156                 if (arc_p > arc_c)
5157                         arc_p = arc_c;
5158         }
5159         ASSERT((int64_t)arc_p >= 0);
5160 }
5161
5162 /*
5163  * Check if arc_size has grown past our upper threshold, determined by
5164  * zfs_arc_overflow_shift.
5165  */
5166 static arc_ovf_level_t
5167 arc_is_overflowing(boolean_t use_reserve)
5168 {
5169         /* Always allow at least one block of overflow */
5170         int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
5171             arc_c >> zfs_arc_overflow_shift);
5172
5173         /*
5174          * We just compare the lower bound here for performance reasons. Our
5175          * primary goals are to make sure that the arc never grows without
5176          * bound, and that it can reach its maximum size. This check
5177          * accomplishes both goals. The maximum amount we could run over by is
5178          * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5179          * in the ARC. In practice, that's in the tens of MB, which is low
5180          * enough to be safe.
5181          */
5182         int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
5183             arc_c - overflow / 2;
5184         if (!use_reserve)
5185                 overflow /= 2;
5186         return (over < 0 ? ARC_OVF_NONE :
5187             over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
5188 }
5189
5190 static abd_t *
5191 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
5192     int alloc_flags)
5193 {
5194         arc_buf_contents_t type = arc_buf_type(hdr);
5195
5196         arc_get_data_impl(hdr, size, tag, alloc_flags);
5197         if (type == ARC_BUFC_METADATA) {
5198                 return (abd_alloc(size, B_TRUE));
5199         } else {
5200                 ASSERT(type == ARC_BUFC_DATA);
5201                 return (abd_alloc(size, B_FALSE));
5202         }
5203 }
5204
5205 static void *
5206 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5207 {
5208         arc_buf_contents_t type = arc_buf_type(hdr);
5209
5210         arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT);
5211         if (type == ARC_BUFC_METADATA) {
5212                 return (zio_buf_alloc(size));
5213         } else {
5214                 ASSERT(type == ARC_BUFC_DATA);
5215                 return (zio_data_buf_alloc(size));
5216         }
5217 }
5218
5219 /*
5220  * Wait for the specified amount of data (in bytes) to be evicted from the
5221  * ARC, and for there to be sufficient free memory in the system.  Waiting for
5222  * eviction ensures that the memory used by the ARC decreases.  Waiting for
5223  * free memory ensures that the system won't run out of free pages, regardless
5224  * of ARC behavior and settings.  See arc_lowmem_init().
5225  */
5226 void
5227 arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
5228 {
5229         switch (arc_is_overflowing(use_reserve)) {
5230         case ARC_OVF_NONE:
5231                 return;
5232         case ARC_OVF_SOME:
5233                 /*
5234                  * This is a bit racy without taking arc_evict_lock, but the
5235                  * worst that can happen is we either call zthr_wakeup() extra
5236                  * time due to race with other thread here, or the set flag
5237                  * get cleared by arc_evict_cb(), which is unlikely due to
5238                  * big hysteresis, but also not important since at this level
5239                  * of overflow the eviction is purely advisory.  Same time
5240                  * taking the global lock here every time without waiting for
5241                  * the actual eviction creates a significant lock contention.
5242                  */
5243                 if (!arc_evict_needed) {
5244                         arc_evict_needed = B_TRUE;
5245                         zthr_wakeup(arc_evict_zthr);
5246                 }
5247                 return;
5248         case ARC_OVF_SEVERE:
5249         default:
5250         {
5251                 arc_evict_waiter_t aw;
5252                 list_link_init(&aw.aew_node);
5253                 cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
5254
5255                 uint64_t last_count = 0;
5256                 mutex_enter(&arc_evict_lock);
5257                 if (!list_is_empty(&arc_evict_waiters)) {
5258                         arc_evict_waiter_t *last =
5259                             list_tail(&arc_evict_waiters);
5260                         last_count = last->aew_count;
5261                 } else if (!arc_evict_needed) {
5262                         arc_evict_needed = B_TRUE;
5263                         zthr_wakeup(arc_evict_zthr);
5264                 }
5265                 /*
5266                  * Note, the last waiter's count may be less than
5267                  * arc_evict_count if we are low on memory in which
5268                  * case arc_evict_state_impl() may have deferred
5269                  * wakeups (but still incremented arc_evict_count).
5270                  */
5271                 aw.aew_count = MAX(last_count, arc_evict_count) + amount;
5272
5273                 list_insert_tail(&arc_evict_waiters, &aw);
5274
5275                 arc_set_need_free();
5276
5277                 DTRACE_PROBE3(arc__wait__for__eviction,
5278                     uint64_t, amount,
5279                     uint64_t, arc_evict_count,
5280                     uint64_t, aw.aew_count);
5281
5282                 /*
5283                  * We will be woken up either when arc_evict_count reaches
5284                  * aew_count, or when the ARC is no longer overflowing and
5285                  * eviction completes.
5286                  * In case of "false" wakeup, we will still be on the list.
5287                  */
5288                 do {
5289                         cv_wait(&aw.aew_cv, &arc_evict_lock);
5290                 } while (list_link_active(&aw.aew_node));
5291                 mutex_exit(&arc_evict_lock);
5292
5293                 cv_destroy(&aw.aew_cv);
5294         }
5295         }
5296 }
5297
5298 /*
5299  * Allocate a block and return it to the caller. If we are hitting the
5300  * hard limit for the cache size, we must sleep, waiting for the eviction
5301  * thread to catch up. If we're past the target size but below the hard
5302  * limit, we'll only signal the reclaim thread and continue on.
5303  */
5304 static void
5305 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
5306     int alloc_flags)
5307 {
5308         arc_state_t *state = hdr->b_l1hdr.b_state;
5309         arc_buf_contents_t type = arc_buf_type(hdr);
5310
5311         if (alloc_flags & ARC_HDR_DO_ADAPT)
5312                 arc_adapt(size, state);
5313
5314         /*
5315          * If arc_size is currently overflowing, we must be adding data
5316          * faster than we are evicting.  To ensure we don't compound the
5317          * problem by adding more data and forcing arc_size to grow even
5318          * further past it's target size, we wait for the eviction thread to
5319          * make some progress.  We also wait for there to be sufficient free
5320          * memory in the system, as measured by arc_free_memory().
5321          *
5322          * Specifically, we wait for zfs_arc_eviction_pct percent of the
5323          * requested size to be evicted.  This should be more than 100%, to
5324          * ensure that that progress is also made towards getting arc_size
5325          * under arc_c.  See the comment above zfs_arc_eviction_pct.
5326          */
5327         arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
5328             alloc_flags & ARC_HDR_USE_RESERVE);
5329
5330         VERIFY3U(hdr->b_type, ==, type);
5331         if (type == ARC_BUFC_METADATA) {
5332                 arc_space_consume(size, ARC_SPACE_META);
5333         } else {
5334                 arc_space_consume(size, ARC_SPACE_DATA);
5335         }
5336
5337         /*
5338          * Update the state size.  Note that ghost states have a
5339          * "ghost size" and so don't need to be updated.
5340          */
5341         if (!GHOST_STATE(state)) {
5342
5343                 (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
5344
5345                 /*
5346                  * If this is reached via arc_read, the link is
5347                  * protected by the hash lock. If reached via
5348                  * arc_buf_alloc, the header should not be accessed by
5349                  * any other thread. And, if reached via arc_read_done,
5350                  * the hash lock will protect it if it's found in the
5351                  * hash table; otherwise no other thread should be
5352                  * trying to [add|remove]_reference it.
5353                  */
5354                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5355                         ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5356                         (void) zfs_refcount_add_many(&state->arcs_esize[type],
5357                             size, tag);
5358                 }
5359
5360                 /*
5361                  * If we are growing the cache, and we are adding anonymous
5362                  * data, and we have outgrown arc_p, update arc_p
5363                  */
5364                 if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c &&
5365                     hdr->b_l1hdr.b_state == arc_anon &&
5366                     (zfs_refcount_count(&arc_anon->arcs_size) +
5367                     zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
5368                         arc_p = MIN(arc_c, arc_p + size);
5369         }
5370 }
5371
5372 static void
5373 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
5374 {
5375         arc_free_data_impl(hdr, size, tag);
5376         abd_free(abd);
5377 }
5378
5379 static void
5380 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
5381 {
5382         arc_buf_contents_t type = arc_buf_type(hdr);
5383
5384         arc_free_data_impl(hdr, size, tag);
5385         if (type == ARC_BUFC_METADATA) {
5386                 zio_buf_free(buf, size);
5387         } else {
5388                 ASSERT(type == ARC_BUFC_DATA);
5389                 zio_data_buf_free(buf, size);
5390         }
5391 }
5392
5393 /*
5394  * Free the arc data buffer.
5395  */
5396 static void
5397 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5398 {
5399         arc_state_t *state = hdr->b_l1hdr.b_state;
5400         arc_buf_contents_t type = arc_buf_type(hdr);
5401
5402         /* protected by hash lock, if in the hash table */
5403         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5404                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5405                 ASSERT(state != arc_anon && state != arc_l2c_only);
5406
5407                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
5408                     size, tag);
5409         }
5410         (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
5411
5412         VERIFY3U(hdr->b_type, ==, type);
5413         if (type == ARC_BUFC_METADATA) {
5414                 arc_space_return(size, ARC_SPACE_META);
5415         } else {
5416                 ASSERT(type == ARC_BUFC_DATA);
5417                 arc_space_return(size, ARC_SPACE_DATA);
5418         }
5419 }
5420
5421 /*
5422  * This routine is called whenever a buffer is accessed.
      *
      * Depending on the state the header is currently in, it refreshes
      * b_arc_access, bumps the matching per-header and global hit counters
      * and, where appropriate, moves the header to a new state through
      * arc_change_state().  The caller must hold hash_lock and hdr must have
      * an L1 header; both are asserted.  An unrecognized state panics.
      *
5423  * NOTE: the hash lock is dropped in this function.
      * NOTE(review): no mutex_exit(hash_lock) is visible in this body, and
      * arc_buf_access() releases the lock itself after calling us, so the
      * note above looks stale -- confirm against arc_change_state().
5424  */
5425 static void
5426 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
5427 {
     /* Only used by the arc_mru branch below. */
5428         clock_t now;
5429
5430         ASSERT(MUTEX_HELD(hash_lock));
5431         ASSERT(HDR_HAS_L1HDR(hdr));
5432
5433         if (hdr->b_l1hdr.b_state == arc_anon) {
5434                 /*
5435                  * This buffer is not in the cache, and does not
5436                  * appear in our "ghost" list.  Add the new buffer
5437                  * to the MRU state.
5438                  */
5439
5440                 ASSERT0(hdr->b_l1hdr.b_arc_access);
5441                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5442                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5443                 arc_change_state(arc_mru, hdr, hash_lock);
5444
5445         } else if (hdr->b_l1hdr.b_state == arc_mru) {
5446                 now = ddi_get_lbolt();
5447
5448                 /*
5449                  * If this buffer is here because of a prefetch, then either:
5450                  * - clear the flag if this is a "referencing" read
5451                  *   (any subsequent access will bump this into the MFU state).
5452                  * or
5453                  * - move the buffer to the head of the list if this is
5454                  *   another prefetch (to make it less likely to be evicted).
                      *
                      * Either way the access time is refreshed and we return
                      * without promoting the header to the MFU state.
5455                  */
5456                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5457                         if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
5458                                 /* link protected by hash lock */
5459                                 ASSERT(multilist_link_active(
5460                                     &hdr->b_l1hdr.b_arc_node));
5461                         } else {
                                     /*
                                      * The L2 arcstats decrement/increment
                                      * pair brackets the flag change.
                                      */
5462                                 if (HDR_HAS_L2HDR(hdr))
5463                                         l2arc_hdr_arcstats_decrement_state(hdr);
5464                                 arc_hdr_clear_flags(hdr,
5465                                     ARC_FLAG_PREFETCH |
5466                                     ARC_FLAG_PRESCIENT_PREFETCH);
5467                                 hdr->b_l1hdr.b_mru_hits++;
5468                                 ARCSTAT_BUMP(arcstat_mru_hits);
5469                                 if (HDR_HAS_L2HDR(hdr))
5470                                         l2arc_hdr_arcstats_increment_state(hdr);
5471                         }
5472                         hdr->b_l1hdr.b_arc_access = now;
5473                         return;
5474                 }
5475
5476                 /*
5477                  * This buffer has been "accessed" only once so far,
5478                  * but it is still in the cache. Move it to the MFU
5479                  * state.
5480                  */
5481                 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
5482                     ARC_MINTIME)) {
5483                         /*
5484                          * More than 125ms have passed since we
5485                          * instantiated this buffer.  Move it to the
5486                          * most frequently used state.
5487                          */
5488                         hdr->b_l1hdr.b_arc_access = now;
5489                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5490                         arc_change_state(arc_mfu, hdr, hash_lock);
5491                 }
                     /* Counted as an MRU hit whether or not it was promoted. */
5492                 hdr->b_l1hdr.b_mru_hits++;
5493                 ARCSTAT_BUMP(arcstat_mru_hits);
5494         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
5495                 arc_state_t     *new_state;
5496                 /*
5497                  * This buffer has been "accessed" recently, but
5498                  * was evicted from the cache.  Move it to the
5499                  * MFU state.
                      *
                      * A header flagged as a prefetch goes back to the MRU
                      * state instead; its prefetch flags are cleared only if
                      * it currently holds references.
5500                  */
5501                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5502                         new_state = arc_mru;
5503                         if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
5504                                 if (HDR_HAS_L2HDR(hdr))
5505                                         l2arc_hdr_arcstats_decrement_state(hdr);
5506                                 arc_hdr_clear_flags(hdr,
5507                                     ARC_FLAG_PREFETCH |
5508                                     ARC_FLAG_PRESCIENT_PREFETCH);
5509                                 if (HDR_HAS_L2HDR(hdr))
5510                                         l2arc_hdr_arcstats_increment_state(hdr);
5511                         }
5512                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5513                 } else {
5514                         new_state = arc_mfu;
5515                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5516                 }
5517
5518                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5519                 arc_change_state(new_state, hdr, hash_lock);
5520
5521                 hdr->b_l1hdr.b_mru_ghost_hits++;
5522                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
5523         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
5524                 /*
5525                  * This buffer has been accessed more than once and is
5526                  * still in the cache.  Keep it in the MFU state.
5527                  *
5528                  * NOTE: an add_reference() that occurred when we did
5529                  * the arc_read() will have kicked this off the list.
5530                  * If it was a prefetch, we will explicitly move it to
5531                  * the head of the list now.
5532                  */
5533
5534                 hdr->b_l1hdr.b_mfu_hits++;
5535                 ARCSTAT_BUMP(arcstat_mfu_hits);
5536                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5537         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
5538                 arc_state_t     *new_state = arc_mfu;
5539                 /*
5540                  * This buffer has been accessed more than once but has
5541                  * been evicted from the cache.  Move it back to the
5542                  * MFU state.
5543                  */
5544
5545                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5546                         /*
5547                          * This is a prefetch access...
5548                          * move this block back to the MRU state.
5549                          */
5550                         new_state = arc_mru;
5551                 }
5552
5553                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
                     /*
                      * NOTE(review): the new_state__mfu probe fires here even
                      * when new_state was switched to arc_mru above.
                      */
5554                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5555                 arc_change_state(new_state, hdr, hash_lock);
5556
5557                 hdr->b_l1hdr.b_mfu_ghost_hits++;
5558                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
5559         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
5560                 /*
5561                  * This buffer is on the 2nd Level ARC.
                      * It goes straight to the MFU state; no hit counter is
                      * bumped in this branch.
5562                  */
5563
5564                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5565                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5566                 arc_change_state(arc_mfu, hdr, hash_lock);
5567         } else {
5568                 cmn_err(CE_PANIC, "invalid arc state 0x%p",
5569                     hdr->b_l1hdr.b_state);
5570         }
5571 }
5572
5573 /*
5574  * This routine is called by dbuf_hold() to update the arc_access() state
5575  * which otherwise would be skipped for entries in the dbuf cache.
5576  */
5577 void
5578 arc_buf_access(arc_buf_t *buf)
5579 {
5580         mutex_enter(&buf->b_evict_lock);
5581         arc_buf_hdr_t *hdr = buf->b_hdr;
5582
5583         /*
5584          * Avoid taking the hash_lock when possible as an optimization.
5585          * The header must be checked again under the hash_lock in order
5586          * to handle the case where it is concurrently being released.
5587          */
5588         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5589                 mutex_exit(&buf->b_evict_lock);
5590                 return;
5591         }
5592
5593         kmutex_t *hash_lock = HDR_LOCK(hdr);
5594         mutex_enter(hash_lock);
5595
5596         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5597                 mutex_exit(hash_lock);
5598                 mutex_exit(&buf->b_evict_lock);
5599                 ARCSTAT_BUMP(arcstat_access_skip);
5600                 return;
5601         }
5602
5603         mutex_exit(&buf->b_evict_lock);
5604
5605         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5606             hdr->b_l1hdr.b_state == arc_mfu);
5607
5608         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5609         arc_access(hdr, hash_lock);
5610         mutex_exit(hash_lock);
5611
5612         ARCSTAT_BUMP(arcstat_hits);
5613         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
5614             demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5615 }
5616
5617 /* a generic arc_read_done_func_t which you can use */
5618 /* ARGSUSED */
5619 void
5620 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5621     arc_buf_t *buf, void *arg)
5622 {
5623         if (buf == NULL)
5624                 return;
5625
5626         bcopy(buf->b_data, arg, arc_buf_size(buf));
5627         arc_buf_destroy(buf, arg);
5628 }
5629
5630 /* a generic arc_read_done_func_t */
5631 /* ARGSUSED */
5632 void
5633 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5634     arc_buf_t *buf, void *arg)
5635 {
5636         arc_buf_t **bufp = arg;
5637
5638         if (buf == NULL) {
5639                 ASSERT(zio == NULL || zio->io_error != 0);
5640                 *bufp = NULL;
5641         } else {
5642                 ASSERT(zio == NULL || zio->io_error == 0);
5643                 *bufp = buf;
5644                 ASSERT(buf->b_data != NULL);
5645         }
5646 }
5647
5648 static void
5649 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5650 {
5651         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5652                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
5653                 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
5654         } else {
5655                 if (HDR_COMPRESSION_ENABLED(hdr)) {
5656                         ASSERT3U(arc_hdr_get_compress(hdr), ==,
5657                             BP_GET_COMPRESS(bp));
5658                 }
5659                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5660                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5661                 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
5662         }
5663 }
5664
     /*
      * Completion callback for a read zio issued on behalf of the ARC header
      * stored in zio->io_private.
      *
      * It records per-block information in the header (encryption parameters
      * for protected blocks and, on success, the byteswap function and
      * compression level), builds an arc_buf_t for every registered callback
      * that asked for one, and updates the header's flags and state according
      * to zio->io_error.  It then wakes anyone waiting on the header's cv,
      * releases the hash lock if one was obtained, and runs and frees every
      * arc_callback_t.
      *
      * The header is destroyed at the end if it has no references left and
      * either the read failed or no hash lock was obtained for it.
      */
5665 static void
5666 arc_read_done(zio_t *zio)
5667 {
5668         blkptr_t        *bp = zio->io_bp;
5669         arc_buf_hdr_t   *hdr = zio->io_private;
5670         kmutex_t        *hash_lock = NULL;
5671         arc_callback_t  *callback_list;
5672         arc_callback_t  *acb;
5673         boolean_t       freeable = B_FALSE;
5674
5675         /*
5676          * The hdr was inserted into hash-table and removed from lists
5677          * prior to starting I/O.  We should find this header, since
5678          * it's in the hash table, and it should be legit since it's
5679          * not possible to evict it during the I/O.  The only possible
5680          * reason for it not to be found is if we were freed during the
5681          * read.
              *
              * buf_hash_find() also hands back hash_lock, which is released
              * further down with mutex_exit().
5682          */
5683         if (HDR_IN_HASH_TABLE(hdr)) {
5684                 arc_buf_hdr_t *found;
5685
5686                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
5687                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
5688                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
5689                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
5690                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
5691
5692                 found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
5693
5694                 ASSERT((found == hdr &&
5695                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
5696                     (found == hdr && HDR_L2_READING(hdr)));
5697                 ASSERT3P(hash_lock, !=, NULL);
5698         }
5699
             /*
              * For protected blocks, stash the object type, objset, salt, IV
              * and MAC in the header.  For intent log blocks the MAC is
              * decoded from a copy of the first sizeof (zil_chain_t) bytes of
              * the data rather than from the bp.
              */
5700         if (BP_IS_PROTECTED(bp)) {
5701                 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
5702                 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
5703                 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
5704                     hdr->b_crypt_hdr.b_iv);
5705
5706                 if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
5707                         void *tmpbuf;
5708
5709                         tmpbuf = abd_borrow_buf_copy(zio->io_abd,
5710                             sizeof (zil_chain_t));
5711                         zio_crypt_decode_mac_zil(tmpbuf,
5712                             hdr->b_crypt_hdr.b_mac);
5713                         abd_return_buf(zio->io_abd, tmpbuf,
5714                             sizeof (zil_chain_t));
5715                 } else {
5716                         zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
5717                 }
5718         }
5719
5720         if (zio->io_error == 0) {
5721                 /* byteswap if necessary */
5722                 if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
5723                         if (BP_GET_LEVEL(zio->io_bp) > 0) {
5724                                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
5725                         } else {
5726                                 hdr->b_l1hdr.b_byteswap =
5727                                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
5728                         }
5729                 } else {
5730                         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
5731                 }
                     /* The compression level is not taken from an L2 read. */
5732                 if (!HDR_L2_READING(hdr)) {
5733                         hdr->b_complevel = zio->io_prop.zp_complevel;
5734                 }
5735         }
5736
             /*
              * With l2arc_noprefetch set, a prefetched header also loses
              * ARC_FLAG_L2CACHE.
              */
5737         arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
5738         if (l2arc_noprefetch && HDR_PREFETCH(hdr))
5739                 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
5740
5741         callback_list = hdr->b_l1hdr.b_acb;
5742         ASSERT3P(callback_list, !=, NULL);
5743
5744         if (hash_lock && zio->io_error == 0 &&
5745             hdr->b_l1hdr.b_state == arc_anon) {
5746                 /*
5747                  * Only call arc_access on anonymous buffers.  This is because
5748                  * if we've issued an I/O for an evicted buffer, we've already
5749                  * called arc_access (to prevent any simultaneous readers from
5750                  * getting confused).
5751                  */
5752                 arc_access(hdr, hash_lock);
5753         }
5754
5755         /*
5756          * If a read request has a callback (i.e. acb_done is not NULL), then we
5757          * make a buf containing the data according to the parameters which were
5758          * passed in. The implementation of arc_buf_alloc_impl() ensures that we
5759          * aren't needlessly decompressing the data multiple times.
              *
              * Callbacks without acb_done, or with acb_nobuf set, are neither
              * counted nor given a buf.  Once zio->io_error is set, whether by
              * the I/O itself or by an earlier iteration, the remaining
              * callbacks are counted but no further bufs are allocated.
5760          */
5761         int callback_cnt = 0;
5762         for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
5763                 if (!acb->acb_done || acb->acb_nobuf)
5764                         continue;
5765
5766                 callback_cnt++;
5767
5768                 if (zio->io_error != 0)
5769                         continue;
5770
5771                 int error = arc_buf_alloc_impl(hdr, zio->io_spa,
5772                     &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
5773                     acb->acb_compressed, acb->acb_noauth, B_TRUE,
5774                     &acb->acb_buf);
5775
5776                 /*
5777                  * Assert non-speculative zios didn't fail because an
5778                  * encryption key wasn't loaded
5779                  */
5780                 ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
5781                     error != EACCES);
5782
5783                 /*
5784                  * If we failed to decrypt, report an error now (as the zio
5785                  * layer would have done if it had done the transforms).
5786                  */
5787                 if (error == ECKSUM) {
5788                         ASSERT(BP_IS_PROTECTED(bp));
5789                         error = SET_ERROR(EIO);
5790                         if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
5791                                 spa_log_error(zio->io_spa, &acb->acb_zb);
5792                                 (void) zfs_ereport_post(
5793                                     FM_EREPORT_ZFS_AUTHENTICATION,
5794                                     zio->io_spa, NULL, &acb->acb_zb, zio, 0);
5795                         }
5796                 }
5797
5798                 if (error != 0) {
5799                         /*
5800                          * Decompression or decryption failed.  Set
5801                          * io_error so that when we call acb_done
5802                          * (below), we will indicate that the read
5803                          * failed. Note that in the unusual case
5804                          * where one callback is compressed and another
5805                          * uncompressed, we will mark all of them
5806                          * as failed, even though the uncompressed
5807                          * one can't actually fail.  In this case,
5808                          * the hdr will not be anonymous, because
5809                          * if there are multiple callbacks, it's
5810                          * because multiple threads found the same
5811                          * arc buf in the hash table.
5812                          */
5813                         zio->io_error = error;
5814                 }
5815         }
5816
5817         /*
5818          * If there are multiple callbacks, we must have the hash lock,
5819          * because the only way for multiple threads to find this hdr is
5820          * in the hash table.  This ensures that if there are multiple
5821          * callbacks, the hdr is not anonymous.  If it were anonymous,
5822          * we couldn't use arc_buf_destroy() in the error case below.
5823          */
5824         ASSERT(callback_cnt < 2 || hash_lock != NULL);
5825
             /*
              * Detach the callbacks from the header (they stay reachable via
              * callback_list) and mark the I/O as finished.
              */
5826         hdr->b_l1hdr.b_acb = NULL;
5827         arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5828         if (callback_cnt == 0)
5829                 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
5830
5831         ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
5832             callback_list != NULL);
5833
5834         if (zio->io_error == 0) {
5835                 arc_hdr_verify(hdr, zio->io_bp);
5836         } else {
                     /*
                      * Failed read: flag the header, make it anonymous, take
                      * it out of the hash table, and remember whether it can
                      * be destroyed once the callbacks have run.
                      */
5837                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
5838                 if (hdr->b_l1hdr.b_state != arc_anon)
5839                         arc_change_state(arc_anon, hdr, hash_lock);
5840                 if (HDR_IN_HASH_TABLE(hdr))
5841                         buf_hash_remove(hdr);
5842                 freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
5843         }
5844
5845         /*
5846          * Broadcast before we drop the hash_lock to avoid the possibility
5847          * that the hdr (and hence the cv) might be freed before we get to
5848          * the cv_broadcast().
5849          */
5850         cv_broadcast(&hdr->b_l1hdr.b_cv);
5851
5852         if (hash_lock != NULL) {
5853                 mutex_exit(hash_lock);
5854         } else {
5855                 /*
5856                  * This block was freed while we waited for the read to
5857                  * complete.  It has been removed from the hash table and
5858                  * moved to the anonymous state (so that it won't show up
5859                  * in the cache).
5860                  */
5861                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
5862                 freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
5863         }
5864
5865         /* execute each callback and free its structure */
5866         while ((acb = callback_list) != NULL) {
5867                 if (acb->acb_done != NULL) {
5868                         if (zio->io_error != 0 && acb->acb_buf != NULL) {
5869                                 /*
5870                                  * If arc_buf_alloc_impl() fails during
5871                                  * decompression, the buf will still be
5872                                  * allocated, and needs to be freed here.
5873                                  */
5874                                 arc_buf_destroy(acb->acb_buf,
5875                                     acb->acb_private);
5876                                 acb->acb_buf = NULL;
5877                         }
5878                         acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
5879                             acb->acb_buf, acb->acb_private);
5880                 }
5881
                     /*
                      * Copy the result into the callback's dummy zio, if it
                      * has one, and issue that zio.
                      */
5882                 if (acb->acb_zio_dummy != NULL) {
5883                         acb->acb_zio_dummy->io_error = zio->io_error;
5884                         zio_nowait(acb->acb_zio_dummy);
5885                 }
5886
5887                 callback_list = acb->acb_next;
5888                 kmem_free(acb, sizeof (arc_callback_t));
5889         }
5890
5891         if (freeable)
5892                 arc_hdr_destroy(hdr);
5893 }
5894
5895 /*
5896  * "Read" the block at the specified DVA (in bp) via the
5897  * cache.  If the block is found in the cache, invoke the provided
5898  * callback immediately and return.  Note that the `zio' parameter
5899  * in the callback will be NULL in this case, since no IO was
5900  * required.  If the block is not in the cache pass the read request
5901  * on to the spa with a substitute callback function, so that the
5902  * requested block will be added to the cache.
5903  *
5904  * If a read request arrives for a block that has a read in-progress,
5905  * either wait for the in-progress read to complete (and return the
5906  * results); or, if this is a read with a "done" func, add a record
5907  * to the read to invoke the "done" func when the read completes,
5908  * and return; or just return.
5909  *
5910  * arc_read_done() will invoke all the requested "done" functions
5911  * for readers of this block.
      *
      * Arguments:
      *   pio       - parent zio handed to zio_read()/zio_read_phys(); when it
      *               is non-NULL a zio_null() child is also created for a
      *               callback queued on an already in-flight read.
      *   spa, bp   - pool and block pointer to read.  bp must not be a hole
      *               or redacted, and an embedded bp must be of type
      *               BP_EMBEDDED_TYPE_DATA (all asserted).
      *   done      - optional completion callback (may be NULL).
      *   private   - argument for done; also passed to arc_buf_alloc_impl()
      *               and remove_reference() on the cache-hit path.
      *   priority  - zio priority.  Async and scrub reads are flagged
      *               ARC_FLAG_PRIO_ASYNC_READ so that a later sync read of
      *               the same block can upgrade the in-flight zio.
      *   zio_flags - ZIO_FLAG_RAW_COMPRESS / ZIO_FLAG_RAW_ENCRYPT request
      *               compressed / raw encrypted data.
      *   arc_flags - in/out.  ARC_FLAG_WAIT or ARC_FLAG_NOWAIT selects
      *               blocking behaviour, ARC_FLAG_CACHED_ONLY forbids I/O,
      *               and ARC_FLAG_CACHED is set here on an L1 hit.
      *   zb        - bookmark copied into the callback record and used for
      *               error reporting and the read history.
      *
      * Returns 0, or:
      *   ECKSUM - zfs_blkptr_verify() rejected bp;
      *   ENOENT - ARC_FLAG_CACHED_ONLY was given and the block is not cached
      *            or its read is still in progress;
      *   EIO    - arc_buf_alloc_impl() reported ECKSUM on a cache hit (other
      *            non-zero results from it are returned unchanged);
      *   the zio_wait() result of the pool read for an ARC_FLAG_WAIT miss.
      * On a cache hit "done" is still invoked when buffer allocation failed,
      * with a NULL buf.
5912  */
5913 int
5914 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
5915     arc_read_done_func_t *done, void *private, zio_priority_t priority,
5916     int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5917 {
5918         arc_buf_hdr_t *hdr = NULL;
5919         kmutex_t *hash_lock = NULL;
5920         zio_t *rzio;
5921         uint64_t guid = spa_load_guid(spa);
5922         boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
5923         boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
5924             (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5925         boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
5926             (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5927         boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
5928         boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
5929         int rc = 0;
5930
5931         ASSERT(!embedded_bp ||
5932             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5933         ASSERT(!BP_IS_HOLE(bp));
5934         ASSERT(!BP_IS_REDACTED(bp));
5935
5936         /*
5937          * Normally SPL_FSTRANS will already be set since kernel threads which
5938          * expect to call the DMU interfaces will set it when created.  System
5939          * calls are similarly handled by setting/cleaning the bit in the
5940          * registered callback (module/os/.../zfs/zpl_*).
5941          *
5942          * External consumers such as Lustre which call the exported DMU
5943          * interfaces may not have set SPL_FSTRANS.  To avoid a deadlock
5944          * on the hash_lock always set and clear the bit.
5945          */
5946         fstrans_cookie_t cookie = spl_fstrans_mark();
             /*
              * Restart point: re-entered after waiting for an in-flight read
              * of this block, or after losing a hash-table insertion race.
              */
5947 top:
5948         /*
5949          * Verify the block pointer contents are reasonable.  This should
5950          * always be the case since the blkptr is protected by a checksum.
5951          * However, if there is damage it's desirable to detect this early
5952          * and treat it as a checksum error.  This allows an alternate blkptr
5953          * to be tried when one is available (e.g. ditto blocks).
5954          */
5955         if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
5956             BLK_VERIFY_LOG)) {
5957                 rc = SET_ERROR(ECKSUM);
5958                 goto out;
5959         }
5960
5961         if (!embedded_bp) {
5962                 /*
5963                  * Embedded BP's have no DVA and require no I/O to "read".
5964                  * Create an anonymous arc buf to back it.
                      *
                      * That happens further down: only non-embedded BPs are
                      * looked up in the hash table here, so hdr stays NULL for
                      * embedded ones and a fresh hdr is allocated for them.
5965                  */
5966                 hdr = buf_hash_find(guid, bp, &hash_lock);
5967         }
5968
5969         /*
5970          * Determine if we have an L1 cache hit or a cache miss. For simplicity
5971          * we maintain encrypted data separately from compressed / uncompressed
5972          * data. If the user is requesting raw encrypted data and we don't have
5973          * that in the header we will read from disk to guarantee that we can
5974          * get it even if the encryption keys aren't loaded.
5975          */
5976         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
5977             (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
5978                 arc_buf_t *buf = NULL;
5979                 *arc_flags |= ARC_FLAG_CACHED;
5980
                     /*
                      * A read of this block is already in flight.  Every exit
                      * from this branch drops hash_lock before leaving.
                      */
5981                 if (HDR_IO_IN_PROGRESS(hdr)) {
5982                         zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5983
5984                         if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
5985                                 mutex_exit(hash_lock);
5986                                 ARCSTAT_BUMP(arcstat_cached_only_in_progress);
5987                                 rc = SET_ERROR(ENOENT);
5988                                 goto out;
5989                         }
5990
5991                         ASSERT3P(head_zio, !=, NULL);
5992                         if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5993                             priority == ZIO_PRIORITY_SYNC_READ) {
5994                                 /*
5995                                  * This is a sync read that needs to wait for
5996                                  * an in-flight async read. Request that the
5997                                  * zio have its priority upgraded.
5998                                  */
5999                                 zio_change_priority(head_zio, priority);
6000                                 DTRACE_PROBE1(arc__async__upgrade__sync,
6001                                     arc_buf_hdr_t *, hdr);
6002                                 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
6003                         }
6004                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
6005                                 arc_hdr_clear_flags(hdr,
6006                                     ARC_FLAG_PREDICTIVE_PREFETCH);
6007                         }
6008
                             /*
                              * Blocking caller: sleep on the hdr's cv until the
                              * in-flight read broadcasts, then start over.
                              */
6009                         if (*arc_flags & ARC_FLAG_WAIT) {
6010                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
6011                                 mutex_exit(hash_lock);
6012                                 goto top;
6013                         }
6014                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6015
                             /*
                              * Non-blocking caller with a callback: push a new
                              * record onto the head of the hdr's callback list
                              * so arc_read_done() invokes it on completion.
                              */
6016                         if (done) {
6017                                 arc_callback_t *acb = NULL;
6018
6019                                 acb = kmem_zalloc(sizeof (arc_callback_t),
6020                                     KM_SLEEP);
6021                                 acb->acb_done = done;
6022                                 acb->acb_private = private;
6023                                 acb->acb_compressed = compressed_read;
6024                                 acb->acb_encrypted = encrypted_read;
6025                                 acb->acb_noauth = noauth_read;
6026                                 acb->acb_nobuf = no_buf;
6027                                 acb->acb_zb = *zb;
6028                                 if (pio != NULL)
6029                                         acb->acb_zio_dummy = zio_null(pio,
6030                                             spa, NULL, NULL, NULL, zio_flags);
6031
6032                                 ASSERT3P(acb->acb_done, !=, NULL);
6033                                 acb->acb_zio_head = head_zio;
6034                                 acb->acb_next = hdr->b_l1hdr.b_acb;
6035                                 hdr->b_l1hdr.b_acb = acb;
6036                         }
6037                         mutex_exit(hash_lock);
6038                         goto out;
6039                 }
6040
6041                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
6042                     hdr->b_l1hdr.b_state == arc_mfu);
6043
6044                 if (done && !no_buf) {
6045                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
6046                                 /*
6047                                  * This is a demand read which does not have to
6048                                  * wait for i/o because we did a predictive
6049                                  * prefetch i/o for it, which has completed.
6050                                  */
6051                                 DTRACE_PROBE1(
6052                                     arc__demand__hit__predictive__prefetch,
6053                                     arc_buf_hdr_t *, hdr);
6054                                 ARCSTAT_BUMP(
6055                                     arcstat_demand_hit_predictive_prefetch);
6056                                 arc_hdr_clear_flags(hdr,
6057                                     ARC_FLAG_PREDICTIVE_PREFETCH);
6058                         }
6059
6060                         if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
6061                                 ARCSTAT_BUMP(
6062                                     arcstat_demand_hit_prescient_prefetch);
6063                                 arc_hdr_clear_flags(hdr,
6064                                     ARC_FLAG_PRESCIENT_PREFETCH);
6065                         }
6066
6067                         ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
6068
6069                         /* Get a buf with the desired data in it. */
6070                         rc = arc_buf_alloc_impl(hdr, spa, zb, private,
6071                             encrypted_read, compressed_read, noauth_read,
6072                             B_TRUE, &buf);
6073                         if (rc == ECKSUM) {
6074                                 /*
6075                                  * Convert authentication and decryption errors
6076                                  * to EIO (and generate an ereport if needed)
6077                                  * before leaving the ARC.
6078                                  */
6079                                 rc = SET_ERROR(EIO);
6080                                 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
6081                                         spa_log_error(spa, zb);
6082                                         (void) zfs_ereport_post(
6083                                             FM_EREPORT_ZFS_AUTHENTICATION,
6084                                             spa, NULL, zb, NULL, 0);
6085                                 }
6086                         }
                             /*
                              * On failure undo the allocation: drop the hdr
                              * reference held for "private" and free the buf.
                              * The callback below then receives a NULL buf.
                              */
6087                         if (rc != 0) {
6088                                 (void) remove_reference(hdr, hash_lock,
6089                                     private);
6090                                 arc_buf_destroy_impl(buf);
6091                                 buf = NULL;
6092                         }
6093
6094                         /* assert any errors weren't due to unloaded keys */
6095                         ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
6096                             rc != EACCES);
6097                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
6098                     zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6099                         if (HDR_HAS_L2HDR(hdr))
6100                                 l2arc_hdr_arcstats_decrement_state(hdr);
6101                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
6102                         if (HDR_HAS_L2HDR(hdr))
6103                                 l2arc_hdr_arcstats_increment_state(hdr);
6104                 }
6105                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
6106                 arc_access(hdr, hash_lock);
6107                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6108                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
6109                 if (*arc_flags & ARC_FLAG_L2CACHE)
6110                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6111                 mutex_exit(hash_lock);
6112                 ARCSTAT_BUMP(arcstat_hits);
6113                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6114                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
6115                     data, metadata, hits);
6116
                     /* Cache hit: no I/O was issued, so the zio argument is NULL. */
6117                 if (done)
6118                         done(NULL, zb, bp, buf, private);
6119         } else {
6120                 uint64_t lsize = BP_GET_LSIZE(bp);
6121                 uint64_t psize = BP_GET_PSIZE(bp);
6122                 arc_callback_t *acb;
6123                 vdev_t *vd = NULL;
6124                 uint64_t addr = 0;
6125                 boolean_t devw = B_FALSE;
6126                 uint64_t size;
6127                 abd_t *hdr_abd;
6128                 int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
6129
                     /* The caller forbade I/O, so a miss is reported as ENOENT. */
6130                 if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
6131                         rc = SET_ERROR(ENOENT);
6132                         if (hash_lock != NULL)
6133                                 mutex_exit(hash_lock);
6134                         goto out;
6135                 }
6136
6137                 if (hdr == NULL) {
6138                         /*
6139                          * This block is not in the cache or it has
6140                          * embedded data.
6141                          */
6142                         arc_buf_hdr_t *exists = NULL;
6143                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
6144                         hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
6145                             BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
6146
                             /*
                              * Only hdrs for non-embedded BPs get an identity and
                              * a hash table entry; for embedded BPs hash_lock is
                              * not assigned here.
                              */
6147                         if (!embedded_bp) {
6148                                 hdr->b_dva = *BP_IDENTITY(bp);
6149                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
6150                                 exists = buf_hash_insert(hdr, &hash_lock);
6151                         }
6152                         if (exists != NULL) {
6153                                 /* somebody beat us to the hash insert */
6154                                 mutex_exit(hash_lock);
6155                                 buf_discard_identity(hdr);
6156                                 arc_hdr_destroy(hdr);
6157                                 goto top; /* restart the IO request */
6158                         }
6159                         alloc_flags |= ARC_HDR_DO_ADAPT;
6160                 } else {
6161                         /*
6162                          * This block is in the ghost cache or encrypted data
6163                          * was requested and we didn't have it. If it was
6164                          * L2-only (and thus didn't have an L1 hdr),
6165                          * we realloc the header to add an L1 hdr.
6166                          */
6167                         if (!HDR_HAS_L1HDR(hdr)) {
6168                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
6169                                     hdr_full_cache);
6170                         }
6171
6172                         if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
6173                                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6174                                 ASSERT(!HDR_HAS_RABD(hdr));
6175                                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6176                                 ASSERT0(zfs_refcount_count(
6177                                     &hdr->b_l1hdr.b_refcnt));
6178                                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
6179                                 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
6180                         } else if (HDR_IO_IN_PROGRESS(hdr)) {
6181                                 /*
6182                                  * If this header already had an IO in progress
6183                                  * and we are performing another IO to fetch
6184                                  * encrypted data we must wait until the first
6185                                  * IO completes so as not to confuse
6186                                  * arc_read_done(). This should be very rare
6187                                  * and so the performance impact shouldn't
6188                                  * matter.
6189                                  */
6190                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
6191                                 mutex_exit(hash_lock);
6192                                 goto top;
6193                         }
6194
6195                         /*
6196                          * This is a delicate dance that we play here.
6197                          * This hdr might be in the ghost list so we access
6198                          * it to move it out of the ghost list before we
6199                          * initiate the read. If it's a prefetch then
6200                          * it won't have a callback so we'll remove the
6201                          * reference that arc_buf_alloc_impl() created. We
6202                          * do this after we've called arc_access() to
6203                          * avoid hitting an assert in remove_reference().
6204                          */
6205                         arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
6206                         arc_access(hdr, hash_lock);
6207                 }
6208
                     /*
                      * Allocate the hdr's data buffer and pick the abd/size the
                      * read will fill: the raw (rabd) buffer for encrypted reads,
                      * the regular (pabd) buffer otherwise.
                      */
6209                 arc_hdr_alloc_abd(hdr, alloc_flags);
6210                 if (encrypted_read) {
6211                         ASSERT(HDR_HAS_RABD(hdr));
6212                         size = HDR_GET_PSIZE(hdr);
6213                         hdr_abd = hdr->b_crypt_hdr.b_rabd;
6214                         zio_flags |= ZIO_FLAG_RAW;
6215                 } else {
6216                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6217                         size = arc_hdr_size(hdr);
6218                         hdr_abd = hdr->b_l1hdr.b_pabd;
6219
6220                         if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
6221                                 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6222                         }
6223
6224                         /*
6225                          * For authenticated bp's, we do not ask the ZIO layer
6226                          * to authenticate them since this will cause the entire
6227                          * IO to fail if the key isn't loaded. Instead, we
6228                          * defer authentication until arc_buf_fill(), which will
6229                          * verify the data when the key is available.
6230                          */
6231                         if (BP_IS_AUTHENTICATED(bp))
6232                                 zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
6233                 }
6234
6235                 if (*arc_flags & ARC_FLAG_PREFETCH &&
6236                     zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6237                         if (HDR_HAS_L2HDR(hdr))
6238                                 l2arc_hdr_arcstats_decrement_state(hdr);
6239                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
6240                         if (HDR_HAS_L2HDR(hdr))
6241                                 l2arc_hdr_arcstats_increment_state(hdr);
6242                 }
6243                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6244                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
6245                 if (*arc_flags & ARC_FLAG_L2CACHE)
6246                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6247                 if (BP_IS_AUTHENTICATED(bp))
6248                         arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6249                 if (BP_GET_LEVEL(bp) > 0)
6250                         arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
6251                 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
6252                         arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
6253                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
6254
                     /*
                      * First callback record for this read; it is created even
                      * when done is NULL and becomes the hdr's callback list.
                      *
                      * NOTE(review): unlike the in-progress path above,
                      * acb_nobuf is not assigned here, so it keeps the zero
                      * value from kmem_zalloc() even when ARC_FLAG_NO_BUF was
                      * passed -- confirm this is intended.
                      */
6255                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
6256                 acb->acb_done = done;
6257                 acb->acb_private = private;
6258                 acb->acb_compressed = compressed_read;
6259                 acb->acb_encrypted = encrypted_read;
6260                 acb->acb_noauth = noauth_read;
6261                 acb->acb_zb = *zb;
6262
6263                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6264                 hdr->b_l1hdr.b_acb = acb;
6265                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6266
6267                 if (HDR_HAS_L2HDR(hdr) &&
6268                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
6269                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
6270                         addr = hdr->b_l2hdr.b_daddr;
6271                         /*
6272                          * Lock out L2ARC device removal.
                              * If the device is dead or SCL_L2ARC cannot be
                              * entered, vd is reset to NULL so that the L2ARC
                              * copy is not used below.
6273                          */
6274                         if (vdev_is_dead(vd) ||
6275                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6276                                 vd = NULL;
6277                 }
6278
6279                 /*
6280                  * We count both async reads and scrub IOs as asynchronous so
6281                  * that both can be upgraded in the event of a cache hit while
6282                  * the read IO is still in-flight.
6283                  */
6284                 if (priority == ZIO_PRIORITY_ASYNC_READ ||
6285                     priority == ZIO_PRIORITY_SCRUB)
6286                         arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6287                 else
6288                         arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6289
6290                 /*
6291                  * At this point, we have a level 1 cache miss or a blkptr
6292                  * with embedded data.  Try again in L2ARC if possible.
6293                  */
6294                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6295
6296                 /*
6297                  * Skip ARC stat bump for block pointers with embedded
6298                  * data. The data are read from the blkptr itself via
6299                  * decode_embedded_bp_compressed().
6300                  */
6301                 if (!embedded_bp) {
6302                         DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
6303                             blkptr_t *, bp, uint64_t, lsize,
6304                             zbookmark_phys_t *, zb);
6305                         ARCSTAT_BUMP(arcstat_misses);
6306                         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6307                             demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
6308                             metadata, misses);
6309                         zfs_racct_read(size, 1);
6310                 }
6311
6312                 /* Check if the spa even has l2 configured */
6313                 const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
6314                     spa->spa_l2cache.sav_count > 0;
6315
6316                 if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
6317                         /*
6318                          * Read from the L2ARC if the following are true:
6319                          * 1. The L2ARC vdev was previously cached.
6320                          * 2. This buffer still has L2ARC metadata.
6321                          * 3. This buffer isn't currently writing to the L2ARC.
6322                          * 4. The L2ARC entry wasn't evicted, which may
6323                          *    also have invalidated the vdev.
6324                          * 5. This isn't prefetch or l2arc_noprefetch is 0.
6325                          */
6326                         if (HDR_HAS_L2HDR(hdr) &&
6327                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
6328                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
6329                                 l2arc_read_callback_t *cb;
6330                                 abd_t *abd;
6331                                 uint64_t asize;
6332
6333                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6334                                 ARCSTAT_BUMP(arcstat_l2_hits);
6335                                 hdr->b_l2hdr.b_hits++;
6336
6337                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
6338                                     KM_SLEEP);
6339                                 cb->l2rcb_hdr = hdr;
6340                                 cb->l2rcb_bp = *bp;
6341                                 cb->l2rcb_zb = *zb;
6342                                 cb->l2rcb_flags = zio_flags;
6343
6344                                 /*
6345                                  * When Compressed ARC is disabled, but the
6346                                  * L2ARC block is compressed, arc_hdr_size()
6347                                  * will have returned LSIZE rather than PSIZE.
6348                                  */
6349                                 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
6350                                     !HDR_COMPRESSION_ENABLED(hdr) &&
6351                                     HDR_GET_PSIZE(hdr) != 0) {
6352                                         size = HDR_GET_PSIZE(hdr);
6353                                 }
6354
                                     /*
                                      * Read into a separate abd when the device's
                                      * allocation size differs from size; otherwise
                                      * read straight into the hdr's abd.
                                      */
6355                                 asize = vdev_psize_to_asize(vd, size);
6356                                 if (asize != size) {
6357                                         abd = abd_alloc_for_io(asize,
6358                                             HDR_ISTYPE_METADATA(hdr));
6359                                         cb->l2rcb_abd = abd;
6360                                 } else {
6361                                         abd = hdr_abd;
6362                                 }
6363
6364                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
6365                                     addr + asize <= vd->vdev_psize -
6366                                     VDEV_LABEL_END_SIZE);
6367
6368                                 /*
6369                                  * l2arc read.  The SCL_L2ARC lock will be
6370                                  * released by l2arc_read_done().
6371                                  * Issue a null zio if the underlying buffer
6372                                  * was squashed to zero size by compression.
6373                                  */
6374                                 ASSERT3U(arc_hdr_get_compress(hdr), !=,
6375                                     ZIO_COMPRESS_EMPTY);
6376                                 rzio = zio_read_phys(pio, vd, addr,
6377                                     asize, abd,
6378                                     ZIO_CHECKSUM_OFF,
6379                                     l2arc_read_done, cb, priority,
6380                                     zio_flags | ZIO_FLAG_DONT_CACHE |
6381                                     ZIO_FLAG_CANFAIL |
6382                                     ZIO_FLAG_DONT_PROPAGATE |
6383                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
6384                                 acb->acb_zio_head = rzio;
6385
6386                                 if (hash_lock != NULL)
6387                                         mutex_exit(hash_lock);
6388
6389                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6390                                     zio_t *, rzio);
6391                                 ARCSTAT_INCR(arcstat_l2_read_bytes,
6392                                     HDR_GET_PSIZE(hdr));
6393
6394                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
6395                                         zio_nowait(rzio);
6396                                         goto out;
6397                                 }
6398
6399                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
6400                                 if (zio_wait(rzio) == 0)
6401                                         goto out;
6402
6403                                 /* l2arc read error; goto zio_read() */
                                     /*
                                      * Re-take the hash lock dropped above before
                                      * falling through to the pool read below.
                                      */
6404                                 if (hash_lock != NULL)
6405                                         mutex_enter(hash_lock);
6406                         } else {
6407                                 DTRACE_PROBE1(l2arc__miss,
6408                                     arc_buf_hdr_t *, hdr);
6409                                 ARCSTAT_BUMP(arcstat_l2_misses);
6410                                 if (HDR_L2_WRITING(hdr))
6411                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
6412                                 spa_config_exit(spa, SCL_L2ARC, vd);
6413                         }
6414                 } else {
6415                         if (vd != NULL)
6416                                 spa_config_exit(spa, SCL_L2ARC, vd);
6417
6418                         /*
6419                          * Only a spa with l2 should contribute to l2
6420                          * miss stats.  (Including the case of having a
6421                          * faulted cache device - that's also a miss.)
6422                          */
6423                         if (spa_has_l2) {
6424                                 /*
6425                                  * Skip ARC stat bump for block pointers with
6426                                  * embedded data. The data are read from the
6427                                  * blkptr itself via
6428                                  * decode_embedded_bp_compressed().
6429                                  */
6430                                 if (!embedded_bp) {
6431                                         DTRACE_PROBE1(l2arc__miss,
6432                                             arc_buf_hdr_t *, hdr);
6433                                         ARCSTAT_BUMP(arcstat_l2_misses);
6434                                 }
6435                         }
6436                 }
6437
                     /*
                      * Regular pool read into the hdr's abd; arc_read_done() is
                      * the completion callback and receives hdr.
                      */
6438                 rzio = zio_read(pio, spa, bp, hdr_abd, size,
6439                     arc_read_done, hdr, priority, zio_flags, zb);
6440                 acb->acb_zio_head = rzio;
6441
6442                 if (hash_lock != NULL)
6443                         mutex_exit(hash_lock);
6444
6445                 if (*arc_flags & ARC_FLAG_WAIT) {
6446                         rc = zio_wait(rzio);
6447                         goto out;
6448                 }
6449
6450                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6451                 zio_nowait(rzio);
6452         }
6453
6454 out:
6455         /* embedded bps don't actually go to disk */
6456         if (!embedded_bp)
6457                 spa_read_history_add(spa, zb, *arc_flags);
6458         spl_fstrans_unmark(cookie);
6459         return (rc);
6460 }
6461
6462 /* Allocate a prune record for func/private and add it to arc_prune_list. */
6463 arc_prune_t *
6464 arc_add_prune_callback(arc_prune_func_t *func, void *private)
6465 {
6466         arc_prune_t *ap = kmem_alloc(sizeof (arc_prune_t), KM_SLEEP);
6467
6468         ap->p_pfunc = func;
6469         ap->p_private = private;
6470         list_link_init(&ap->p_node);
6471         zfs_refcount_create(&ap->p_refcnt);
6472
6473         /* Publish it; the list's reference is tagged with &arc_prune_list. */
6474         mutex_enter(&arc_prune_mtx);
6475         zfs_refcount_add(&ap->p_refcnt, &arc_prune_list);
6476         list_insert_head(&arc_prune_list, ap);
6477         mutex_exit(&arc_prune_mtx);
6478         return (ap);
6479 }
6480
6481 /* Take p off arc_prune_list, wait for pending prune work, then free it. */
6482 void
6483 arc_remove_prune_callback(arc_prune_t *p)
6484 {
6485         mutex_enter(&arc_prune_mtx);
6486         list_remove(&arc_prune_list, p);
6487         boolean_t busy =
6488             (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0);
6489         mutex_exit(&arc_prune_mtx);
6490
6491         /* references remain: wait for arc_prune_task to finish */
6492         if (busy)
6493                 taskq_wait_outstanding(arc_prune_taskq, 0);
6494         ASSERT0(zfs_refcount_count(&p->p_refcnt));
6495         zfs_refcount_destroy(&p->p_refcnt);
6496         kmem_free(p, sizeof (arc_prune_t));
6497 }
6498
6499 /*
6500  * Notify the arc that a block was freed, and thus will never be used again.
6501  */
6502 void
6503 arc_freed(spa_t *spa, const blkptr_t *bp)
6504 {
6505         arc_buf_hdr_t *hdr;
6506         kmutex_t *hash_lock;
6507         uint64_t guid = spa_load_guid(spa);
6508
6509         ASSERT(!BP_IS_EMBEDDED(bp));
6510
6511         hdr = buf_hash_find(guid, bp, &hash_lock);
6512         if (hdr == NULL)
6513                 return;
6514
6515         /*
6516          * We might be trying to free a block that is still doing I/O
6517          * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
6518          * dmu_sync-ed block). If this block is being prefetched, then it
6519          * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
6520          * until the I/O completes. A block may also have a reference if it is
6521          * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6522          * have written the new block to its final resting place on disk but
6523          * without the dedup flag set. This would have left the hdr in the MRU
6524          * state and discoverable. When the txg finally syncs it detects that
6525          * the block was overridden in open context and issues an override I/O.
6526          * Since this is a dedup block, the override I/O will determine if the
6527          * block is already in the DDT. If so, then it will replace the io_bp
6528          * with the bp from the DDT and allow the I/O to finish. When the I/O
6529          * reaches the done callback, dbuf_write_override_done, it will
6530          * check to see if the io_bp and io_bp_override are identical.
6531          * If they are not, then it indicates that the bp was replaced with
6532          * the bp in the DDT and the override bp is freed. This allows
6533          * us to arrive here with a reference on a block that is being
6534          * freed. So if we have an I/O in progress, or a reference to
6535          * this hdr, then we don't destroy the hdr.
6536          */
6537         if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
6538             zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
6539                 arc_change_state(arc_anon, hdr, hash_lock);
6540                 arc_hdr_destroy(hdr);
6541                 mutex_exit(hash_lock);
6542         } else {
6543                 mutex_exit(hash_lock);
6544         }
6545
6546 }
6547
6548 /*
6549  * Release this buffer from the cache, making it an anonymous buffer.  This
6550  * must be done after a read and prior to modifying the buffer contents.
6551  * If the buffer has more than one reference, we must make
6552  * a new hdr for the buffer.
6553  */
6554 void
6555 arc_release(arc_buf_t *buf, void *tag)
6556 {
6557         arc_buf_hdr_t *hdr = buf->b_hdr;
6558
6559         /*
6560          * It would be nice to assert that if its DMU metadata (level >
6561          * 0 || it's the dnode file), then it must be syncing context.
6562          * But we don't know that information at this level.
6563          */
6564
6565         mutex_enter(&buf->b_evict_lock);
6566
6567         ASSERT(HDR_HAS_L1HDR(hdr));
6568
6569         /*
6570          * We don't grab the hash lock prior to this check, because if
6571          * the buffer's header is in the arc_anon state, it won't be
6572          * linked into the hash table.
6573          */
6574         if (hdr->b_l1hdr.b_state == arc_anon) {
6575                 mutex_exit(&buf->b_evict_lock);
6576                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6577                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
6578                 ASSERT(!HDR_HAS_L2HDR(hdr));
6579
6580                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6581                 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6582                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
6583
6584                 hdr->b_l1hdr.b_arc_access = 0;
6585
6586                 /*
6587                  * If the buf is being overridden then it may already
6588                  * have a hdr that is not empty.
6589                  */
6590                 buf_discard_identity(hdr);
6591                 arc_buf_thaw(buf);
6592
6593                 return;
6594         }
6595
6596         kmutex_t *hash_lock = HDR_LOCK(hdr);
6597         mutex_enter(hash_lock);
6598
6599         /*
6600          * This assignment is only valid as long as the hash_lock is
6601          * held, we must be careful not to reference state or the
6602          * b_state field after dropping the lock.
6603          */
6604         arc_state_t *state = hdr->b_l1hdr.b_state;
6605         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6606         ASSERT3P(state, !=, arc_anon);
6607
6608         /* this buffer is not on any list */
6609         ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
6610
6611         if (HDR_HAS_L2HDR(hdr)) {
6612                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6613
6614                 /*
6615                  * We have to recheck this conditional again now that
6616                  * we're holding the l2ad_mtx to prevent a race with
6617                  * another thread which might be concurrently calling
6618                  * l2arc_evict(). In that case, l2arc_evict() might have
6619                  * destroyed the header's L2 portion as we were waiting
6620                  * to acquire the l2ad_mtx.
6621                  */
6622                 if (HDR_HAS_L2HDR(hdr))
6623                         arc_hdr_l2hdr_destroy(hdr);
6624
6625                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6626         }
6627
6628         /*
6629          * Do we have more than one buf?
6630          */
6631         if (hdr->b_l1hdr.b_bufcnt > 1) {
6632                 arc_buf_hdr_t *nhdr;
6633                 uint64_t spa = hdr->b_spa;
6634                 uint64_t psize = HDR_GET_PSIZE(hdr);
6635                 uint64_t lsize = HDR_GET_LSIZE(hdr);
6636                 boolean_t protected = HDR_PROTECTED(hdr);
6637                 enum zio_compress compress = arc_hdr_get_compress(hdr);
6638                 arc_buf_contents_t type = arc_buf_type(hdr);
6639                 VERIFY3U(hdr->b_type, ==, type);
6640
6641                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
6642                 (void) remove_reference(hdr, hash_lock, tag);
6643
6644                 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
6645                         ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6646                         ASSERT(ARC_BUF_LAST(buf));
6647                 }
6648
6649                 /*
6650                  * Pull the data off of this hdr and attach it to
6651                  * a new anonymous hdr. Also find the last buffer
6652                  * in the hdr's buffer list.
6653                  */
6654                 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
6655                 ASSERT3P(lastbuf, !=, NULL);
6656
6657                 /*
6658                  * If the current arc_buf_t and the hdr are sharing their data
6659                  * buffer, then we must stop sharing that block.
6660                  */
6661                 if (arc_buf_is_shared(buf)) {
6662                         ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6663                         VERIFY(!arc_buf_is_shared(lastbuf));
6664
6665                         /*
6666                          * First, sever the block sharing relationship between
6667                          * buf and the arc_buf_hdr_t.
6668                          */
6669                         arc_unshare_buf(hdr, buf);
6670
6671                         /*
6672                          * Now we need to recreate the hdr's b_pabd. Since we
6673                          * have lastbuf handy, we try to share with it, but if
6674                          * we can't then we allocate a new b_pabd and copy the
6675                          * data from buf into it.
6676                          */
6677                         if (arc_can_share(hdr, lastbuf)) {
6678                                 arc_share_buf(hdr, lastbuf);
6679                         } else {
6680                                 arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
6681                                 abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6682                                     buf->b_data, psize);
6683                         }
6684                         VERIFY3P(lastbuf->b_data, !=, NULL);
6685                 } else if (HDR_SHARED_DATA(hdr)) {
6686                         /*
6687                          * Uncompressed shared buffers are always at the end
6688                          * of the list. Compressed buffers don't have the
6689                          * same requirements. This makes it hard to
6690                          * simply assert that the lastbuf is shared so
6691                          * we rely on the hdr's compression flags to determine
6692                          * if we have a compressed, shared buffer.
6693                          */
6694                         ASSERT(arc_buf_is_shared(lastbuf) ||
6695                             arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
6696                         ASSERT(!ARC_BUF_SHARED(buf));
6697                 }
6698
6699                 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
6700                 ASSERT3P(state, !=, arc_l2c_only);
6701
6702                 (void) zfs_refcount_remove_many(&state->arcs_size,
6703                     arc_buf_size(buf), buf);
6704
6705                 if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6706                         ASSERT3P(state, !=, arc_l2c_only);
6707                         (void) zfs_refcount_remove_many(
6708                             &state->arcs_esize[type],
6709                             arc_buf_size(buf), buf);
6710                 }
6711
6712                 hdr->b_l1hdr.b_bufcnt -= 1;
6713                 if (ARC_BUF_ENCRYPTED(buf))
6714                         hdr->b_crypt_hdr.b_ebufcnt -= 1;
6715
6716                 arc_cksum_verify(buf);
6717                 arc_buf_unwatch(buf);
6718
6719                 /* if this is the last uncompressed buf free the checksum */
6720                 if (!arc_hdr_has_uncompressed_buf(hdr))
6721                         arc_cksum_free(hdr);
6722
6723                 mutex_exit(hash_lock);
6724
6725                 /*
6726                  * Allocate a new hdr. The new hdr will contain a b_pabd
6727                  * buffer which will be freed in arc_write().
6728                  */
6729                 nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
6730                     compress, hdr->b_complevel, type);
6731                 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
6732                 ASSERT0(nhdr->b_l1hdr.b_bufcnt);
6733                 ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
6734                 VERIFY3U(nhdr->b_type, ==, type);
6735                 ASSERT(!HDR_SHARED_DATA(nhdr));
6736
6737                 nhdr->b_l1hdr.b_buf = buf;
6738                 nhdr->b_l1hdr.b_bufcnt = 1;
6739                 if (ARC_BUF_ENCRYPTED(buf))
6740                         nhdr->b_crypt_hdr.b_ebufcnt = 1;
6741                 (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
6742                 buf->b_hdr = nhdr;
6743
6744                 mutex_exit(&buf->b_evict_lock);
6745                 (void) zfs_refcount_add_many(&arc_anon->arcs_size,
6746                     arc_buf_size(buf), buf);
6747         } else {
6748                 mutex_exit(&buf->b_evict_lock);
6749                 ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
6750                 /* protected by hash lock, or hdr is on arc_anon */
6751                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6752                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6753                 hdr->b_l1hdr.b_mru_hits = 0;
6754                 hdr->b_l1hdr.b_mru_ghost_hits = 0;
6755                 hdr->b_l1hdr.b_mfu_hits = 0;
6756                 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
6757                 arc_change_state(arc_anon, hdr, hash_lock);
6758                 hdr->b_l1hdr.b_arc_access = 0;
6759
6760                 mutex_exit(hash_lock);
6761                 buf_discard_identity(hdr);
6762                 arc_buf_thaw(buf);
6763         }
6764 }
6765
6766 int
6767 arc_released(arc_buf_t *buf)
6768 {
6769         int released;
6770
6771         mutex_enter(&buf->b_evict_lock);
6772         released = (buf->b_data != NULL &&
6773             buf->b_hdr->b_l1hdr.b_state == arc_anon);
6774         mutex_exit(&buf->b_evict_lock);
6775         return (released);
6776 }
6777
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the reference count of buf's hdr, sampled
 * while holding the buf's b_evict_lock.  The count is returned as an int.
 */
int
arc_referenced(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	int holds;

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;
	holds = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
	mutex_exit(&buf->b_evict_lock);

	return (holds);
}
#endif
6790
6791 static void
6792 arc_write_ready(zio_t *zio)
6793 {
6794         arc_write_callback_t *callback = zio->io_private;
6795         arc_buf_t *buf = callback->awcb_buf;
6796         arc_buf_hdr_t *hdr = buf->b_hdr;
6797         blkptr_t *bp = zio->io_bp;
6798         uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
6799         fstrans_cookie_t cookie = spl_fstrans_mark();
6800
6801         ASSERT(HDR_HAS_L1HDR(hdr));
6802         ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
6803         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
6804
6805         /*
6806          * If we're reexecuting this zio because the pool suspended, then
6807          * cleanup any state that was previously set the first time the
6808          * callback was invoked.
6809          */
6810         if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
6811                 arc_cksum_free(hdr);
6812                 arc_buf_unwatch(buf);
6813                 if (hdr->b_l1hdr.b_pabd != NULL) {
6814                         if (arc_buf_is_shared(buf)) {
6815                                 arc_unshare_buf(hdr, buf);
6816                         } else {
6817                                 arc_hdr_free_abd(hdr, B_FALSE);
6818                         }
6819                 }
6820
6821                 if (HDR_HAS_RABD(hdr))
6822                         arc_hdr_free_abd(hdr, B_TRUE);
6823         }
6824         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6825         ASSERT(!HDR_HAS_RABD(hdr));
6826         ASSERT(!HDR_SHARED_DATA(hdr));
6827         ASSERT(!arc_buf_is_shared(buf));
6828
6829         callback->awcb_ready(zio, buf, callback->awcb_private);
6830
6831         if (HDR_IO_IN_PROGRESS(hdr))
6832                 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
6833
6834         arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6835
6836         if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
6837                 hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
6838
6839         if (BP_IS_PROTECTED(bp)) {
6840                 /* ZIL blocks are written through zio_rewrite */
6841                 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
6842                 ASSERT(HDR_PROTECTED(hdr));
6843
6844                 if (BP_SHOULD_BYTESWAP(bp)) {
6845                         if (BP_GET_LEVEL(bp) > 0) {
6846                                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
6847                         } else {
6848                                 hdr->b_l1hdr.b_byteswap =
6849                                     DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
6850                         }
6851                 } else {
6852                         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
6853                 }
6854
6855                 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
6856                 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
6857                 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
6858                     hdr->b_crypt_hdr.b_iv);
6859                 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
6860         }
6861
6862         /*
6863          * If this block was written for raw encryption but the zio layer
6864          * ended up only authenticating it, adjust the buffer flags now.
6865          */
6866         if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
6867                 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6868                 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6869                 if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
6870                         buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6871         } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
6872                 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6873                 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6874         }
6875
6876         /* this must be done after the buffer flags are adjusted */
6877         arc_cksum_compute(buf);
6878
6879         enum zio_compress compress;
6880         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
6881                 compress = ZIO_COMPRESS_OFF;
6882         } else {
6883                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
6884                 compress = BP_GET_COMPRESS(bp);
6885         }
6886         HDR_SET_PSIZE(hdr, psize);
6887         arc_hdr_set_compress(hdr, compress);
6888         hdr->b_complevel = zio->io_prop.zp_complevel;
6889
6890         if (zio->io_error != 0 || psize == 0)
6891                 goto out;
6892
6893         /*
6894          * Fill the hdr with data. If the buffer is encrypted we have no choice
6895          * but to copy the data into b_radb. If the hdr is compressed, the data
6896          * we want is available from the zio, otherwise we can take it from
6897          * the buf.
6898          *
6899          * We might be able to share the buf's data with the hdr here. However,
6900          * doing so would cause the ARC to be full of linear ABDs if we write a
6901          * lot of shareable data. As a compromise, we check whether scattered
6902          * ABDs are allowed, and assume that if they are then the user wants
6903          * the ARC to be primarily filled with them regardless of the data being
6904          * written. Therefore, if they're allowed then we allocate one and copy
6905          * the data into it; otherwise, we share the data directly if we can.
6906          */
6907         if (ARC_BUF_ENCRYPTED(buf)) {
6908                 ASSERT3U(psize, >, 0);
6909                 ASSERT(ARC_BUF_COMPRESSED(buf));
6910                 arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA |
6911                     ARC_HDR_USE_RESERVE);
6912                 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6913         } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
6914                 /*
6915                  * Ideally, we would always copy the io_abd into b_pabd, but the
6916                  * user may have disabled compressed ARC, thus we must check the
6917                  * hdr's compression setting rather than the io_bp's.
6918                  */
6919                 if (BP_IS_ENCRYPTED(bp)) {
6920                         ASSERT3U(psize, >, 0);
6921                         arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
6922                             ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE);
6923                         abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6924                 } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
6925                     !ARC_BUF_COMPRESSED(buf)) {
6926                         ASSERT3U(psize, >, 0);
6927                         arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
6928                             ARC_HDR_USE_RESERVE);
6929                         abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
6930                 } else {
6931                         ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
6932                         arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
6933                             ARC_HDR_USE_RESERVE);
6934                         abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
6935                             arc_buf_size(buf));
6936                 }
6937         } else {
6938                 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
6939                 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
6940                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6941
6942                 arc_share_buf(hdr, buf);
6943         }
6944
6945 out:
6946         arc_hdr_verify(hdr, bp);
6947         spl_fstrans_unmark(cookie);
6948 }
6949
6950 static void
6951 arc_write_children_ready(zio_t *zio)
6952 {
6953         arc_write_callback_t *callback = zio->io_private;
6954         arc_buf_t *buf = callback->awcb_buf;
6955
6956         callback->awcb_children_ready(zio, buf, callback->awcb_private);
6957 }
6958
6959 /*
6960  * The SPA calls this callback for each physical write that happens on behalf
6961  * of a logical write.  See the comment in dbuf_write_physdone() for details.
6962  */
6963 static void
6964 arc_write_physdone(zio_t *zio)
6965 {
6966         arc_write_callback_t *cb = zio->io_private;
6967         if (cb->awcb_physdone != NULL)
6968                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6969 }
6970
6971 static void
6972 arc_write_done(zio_t *zio)
6973 {
6974         arc_write_callback_t *callback = zio->io_private;
6975         arc_buf_t *buf = callback->awcb_buf;
6976         arc_buf_hdr_t *hdr = buf->b_hdr;
6977
6978         ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6979
6980         if (zio->io_error == 0) {
6981                 arc_hdr_verify(hdr, zio->io_bp);
6982
6983                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
6984                         buf_discard_identity(hdr);
6985                 } else {
6986                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
6987                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
6988                 }
6989         } else {
6990                 ASSERT(HDR_EMPTY(hdr));
6991         }
6992
6993         /*
6994          * If the block to be written was all-zero or compressed enough to be
6995          * embedded in the BP, no write was performed so there will be no
6996          * dva/birth/checksum.  The buffer must therefore remain anonymous
6997          * (and uncached).
6998          */
6999         if (!HDR_EMPTY(hdr)) {
7000                 arc_buf_hdr_t *exists;
7001                 kmutex_t *hash_lock;
7002
7003                 ASSERT3U(zio->io_error, ==, 0);
7004
7005                 arc_cksum_verify(buf);
7006
7007                 exists = buf_hash_insert(hdr, &hash_lock);
7008                 if (exists != NULL) {
7009                         /*
7010                          * This can only happen if we overwrite for
7011                          * sync-to-convergence, because we remove
7012                          * buffers from the hash table when we arc_free().
7013                          */
7014                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
7015                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
7016                                         panic("bad overwrite, hdr=%p exists=%p",
7017                                             (void *)hdr, (void *)exists);
7018                                 ASSERT(zfs_refcount_is_zero(
7019                                     &exists->b_l1hdr.b_refcnt));
7020                                 arc_change_state(arc_anon, exists, hash_lock);
7021                                 arc_hdr_destroy(exists);
7022                                 mutex_exit(hash_lock);
7023                                 exists = buf_hash_insert(hdr, &hash_lock);
7024                                 ASSERT3P(exists, ==, NULL);
7025                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
7026                                 /* nopwrite */
7027                                 ASSERT(zio->io_prop.zp_nopwrite);
7028                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
7029                                         panic("bad nopwrite, hdr=%p exists=%p",
7030                                             (void *)hdr, (void *)exists);
7031                         } else {
7032                                 /* Dedup */
7033                                 ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
7034                                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
7035                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
7036                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
7037                         }
7038                 }
7039                 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
7040                 /* if it's not anon, we are doing a scrub */
7041                 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
7042                         arc_access(hdr, hash_lock);
7043                 mutex_exit(hash_lock);
7044         } else {
7045                 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
7046         }
7047
7048         ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
7049         callback->awcb_done(zio, buf, callback->awcb_private);
7050
7051         abd_free(zio->io_abd);
7052         kmem_free(callback, sizeof (arc_write_callback_t));
7053 }
7054
7055 zio_t *
7056 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
7057     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
7058     const zio_prop_t *zp, arc_write_done_func_t *ready,
7059     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
7060     arc_write_done_func_t *done, void *private, zio_priority_t priority,
7061     int zio_flags, const zbookmark_phys_t *zb)
7062 {
7063         arc_buf_hdr_t *hdr = buf->b_hdr;
7064         arc_write_callback_t *callback;
7065         zio_t *zio;
7066         zio_prop_t localprop = *zp;
7067
7068         ASSERT3P(ready, !=, NULL);
7069         ASSERT3P(done, !=, NULL);
7070         ASSERT(!HDR_IO_ERROR(hdr));
7071         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
7072         ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
7073         ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
7074         if (l2arc)
7075                 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
7076
7077         if (ARC_BUF_ENCRYPTED(buf)) {
7078                 ASSERT(ARC_BUF_COMPRESSED(buf));
7079                 localprop.zp_encrypt = B_TRUE;
7080                 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7081                 localprop.zp_complevel = hdr->b_complevel;
7082                 localprop.zp_byteorder =
7083                     (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
7084                     ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
7085                 bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
7086                     ZIO_DATA_SALT_LEN);
7087                 bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
7088                     ZIO_DATA_IV_LEN);
7089                 bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
7090                     ZIO_DATA_MAC_LEN);
7091                 if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
7092                         localprop.zp_nopwrite = B_FALSE;
7093                         localprop.zp_copies =
7094                             MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
7095                 }
7096                 zio_flags |= ZIO_FLAG_RAW;
7097         } else if (ARC_BUF_COMPRESSED(buf)) {
7098                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
7099                 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
7100                 localprop.zp_complevel = hdr->b_complevel;
7101                 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
7102         }
7103         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
7104         callback->awcb_ready = ready;
7105         callback->awcb_children_ready = children_ready;
7106         callback->awcb_physdone = physdone;
7107         callback->awcb_done = done;
7108         callback->awcb_private = private;
7109         callback->awcb_buf = buf;
7110
7111         /*
7112          * The hdr's b_pabd is now stale, free it now. A new data block
7113          * will be allocated when the zio pipeline calls arc_write_ready().
7114          */
7115         if (hdr->b_l1hdr.b_pabd != NULL) {
7116                 /*
7117                  * If the buf is currently sharing the data block with
7118                  * the hdr then we need to break that relationship here.
7119                  * The hdr will remain with a NULL data pointer and the
7120                  * buf will take sole ownership of the block.
7121                  */
7122                 if (arc_buf_is_shared(buf)) {
7123                         arc_unshare_buf(hdr, buf);
7124                 } else {
7125                         arc_hdr_free_abd(hdr, B_FALSE);
7126                 }
7127                 VERIFY3P(buf->b_data, !=, NULL);
7128         }
7129
7130         if (HDR_HAS_RABD(hdr))
7131                 arc_hdr_free_abd(hdr, B_TRUE);
7132
7133         if (!(zio_flags & ZIO_FLAG_RAW))
7134                 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
7135
7136         ASSERT(!arc_buf_is_shared(buf));
7137         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
7138
7139         zio = zio_write(pio, spa, txg, bp,
7140             abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
7141             HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
7142             (children_ready != NULL) ? arc_write_children_ready : NULL,
7143             arc_write_physdone, arc_write_done, callback,
7144             priority, zio_flags, zb);
7145
7146         return (zio);
7147 }
7148
7149 void
7150 arc_tempreserve_clear(uint64_t reserve)
7151 {
7152         atomic_add_64(&arc_tempreserve, -reserve);
7153         ASSERT((int64_t)arc_tempreserve >= 0);
7154 }
7155
7156 int
7157 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
7158 {
7159         int error;
7160         uint64_t anon_size;
7161
7162         if (!arc_no_grow &&
7163             reserve > arc_c/4 &&
7164             reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
7165                 arc_c = MIN(arc_c_max, reserve * 4);
7166
7167         /*
7168          * Throttle when the calculated memory footprint for the TXG
7169          * exceeds the target ARC size.
7170          */
7171         if (reserve > arc_c) {
7172                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
7173                 return (SET_ERROR(ERESTART));
7174         }
7175
7176         /*
7177          * Don't count loaned bufs as in flight dirty data to prevent long
7178          * network delays from blocking transactions that are ready to be
7179          * assigned to a txg.
7180          */
7181
7182         /* assert that it has not wrapped around */
7183         ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
7184
7185         anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
7186             arc_loaned_bytes), 0);
7187
7188         /*
7189          * Writes will, almost always, require additional memory allocations
7190          * in order to compress/encrypt/etc the data.  We therefore need to
7191          * make sure that there is sufficient available memory for this.
7192          */
7193         error = arc_memory_throttle(spa, reserve, txg);
7194         if (error != 0)
7195                 return (error);
7196
7197         /*
7198          * Throttle writes when the amount of dirty data in the cache
7199          * gets too large.  We try to keep the cache less than half full
7200          * of dirty blocks so that our sync times don't grow too large.
7201          *
7202          * In the case of one pool being built on another pool, we want
7203          * to make sure we don't end up throttling the lower (backing)
7204          * pool when the upper pool is the majority contributor to dirty
7205          * data. To insure we make forward progress during throttling, we
7206          * also check the current pool's net dirty data and only throttle
7207          * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
7208          * data in the cache.
7209          *
7210          * Note: if two requests come in concurrently, we might let them
7211          * both succeed, when one of them should fail.  Not a huge deal.
7212          */
7213         uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
7214         uint64_t spa_dirty_anon = spa_dirty_data(spa);
7215         uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
7216         if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
7217             anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
7218             spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
7219 #ifdef ZFS_DEBUG
7220                 uint64_t meta_esize = zfs_refcount_count(
7221                     &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7222                 uint64_t data_esize =
7223                     zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7224                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
7225                     "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
7226                     (u_longlong_t)arc_tempreserve >> 10,
7227                     (u_longlong_t)meta_esize >> 10,
7228                     (u_longlong_t)data_esize >> 10,
7229                     (u_longlong_t)reserve >> 10,
7230                     (u_longlong_t)rarc_c >> 10);
7231 #endif
7232                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
7233                 return (SET_ERROR(ERESTART));
7234         }
7235         atomic_add_64(&arc_tempreserve, reserve);
7236         return (0);
7237 }
7238
7239 static void
7240 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
7241     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
7242 {
7243         size->value.ui64 = zfs_refcount_count(&state->arcs_size);
7244         evict_data->value.ui64 =
7245             zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
7246         evict_metadata->value.ui64 =
7247             zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
7248 }
7249
7250 static int
7251 arc_kstat_update(kstat_t *ksp, int rw)
7252 {
7253         arc_stats_t *as = ksp->ks_data;
7254
7255         if (rw == KSTAT_WRITE)
7256                 return (SET_ERROR(EACCES));
7257
7258         as->arcstat_hits.value.ui64 =
7259             wmsum_value(&arc_sums.arcstat_hits);
7260         as->arcstat_misses.value.ui64 =
7261             wmsum_value(&arc_sums.arcstat_misses);
7262         as->arcstat_demand_data_hits.value.ui64 =
7263             wmsum_value(&arc_sums.arcstat_demand_data_hits);
7264         as->arcstat_demand_data_misses.value.ui64 =
7265             wmsum_value(&arc_sums.arcstat_demand_data_misses);
7266         as->arcstat_demand_metadata_hits.value.ui64 =
7267             wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
7268         as->arcstat_demand_metadata_misses.value.ui64 =
7269             wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
7270         as->arcstat_prefetch_data_hits.value.ui64 =
7271             wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
7272         as->arcstat_prefetch_data_misses.value.ui64 =
7273             wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
7274         as->arcstat_prefetch_metadata_hits.value.ui64 =
7275             wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
7276         as->arcstat_prefetch_metadata_misses.value.ui64 =
7277             wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
7278         as->arcstat_mru_hits.value.ui64 =
7279             wmsum_value(&arc_sums.arcstat_mru_hits);
7280         as->arcstat_mru_ghost_hits.value.ui64 =
7281             wmsum_value(&arc_sums.arcstat_mru_ghost_hits);
7282         as->arcstat_mfu_hits.value.ui64 =
7283             wmsum_value(&arc_sums.arcstat_mfu_hits);
7284         as->arcstat_mfu_ghost_hits.value.ui64 =
7285             wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
7286         as->arcstat_deleted.value.ui64 =
7287             wmsum_value(&arc_sums.arcstat_deleted);
7288         as->arcstat_mutex_miss.value.ui64 =
7289             wmsum_value(&arc_sums.arcstat_mutex_miss);
7290         as->arcstat_access_skip.value.ui64 =
7291             wmsum_value(&arc_sums.arcstat_access_skip);
7292         as->arcstat_evict_skip.value.ui64 =
7293             wmsum_value(&arc_sums.arcstat_evict_skip);
7294         as->arcstat_evict_not_enough.value.ui64 =
7295             wmsum_value(&arc_sums.arcstat_evict_not_enough);
7296         as->arcstat_evict_l2_cached.value.ui64 =
7297             wmsum_value(&arc_sums.arcstat_evict_l2_cached);
7298         as->arcstat_evict_l2_eligible.value.ui64 =
7299             wmsum_value(&arc_sums.arcstat_evict_l2_eligible);
7300         as->arcstat_evict_l2_eligible_mfu.value.ui64 =
7301             wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu);
7302         as->arcstat_evict_l2_eligible_mru.value.ui64 =
7303             wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru);
7304         as->arcstat_evict_l2_ineligible.value.ui64 =
7305             wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
7306         as->arcstat_evict_l2_skip.value.ui64 =
7307             wmsum_value(&arc_sums.arcstat_evict_l2_skip);
7308         as->arcstat_hash_collisions.value.ui64 =
7309             wmsum_value(&arc_sums.arcstat_hash_collisions);
7310         as->arcstat_hash_chains.value.ui64 =
7311             wmsum_value(&arc_sums.arcstat_hash_chains);
7312         as->arcstat_size.value.ui64 =
7313             aggsum_value(&arc_sums.arcstat_size);
7314         as->arcstat_compressed_size.value.ui64 =
7315             wmsum_value(&arc_sums.arcstat_compressed_size);
7316         as->arcstat_uncompressed_size.value.ui64 =
7317             wmsum_value(&arc_sums.arcstat_uncompressed_size);
7318         as->arcstat_overhead_size.value.ui64 =
7319             wmsum_value(&arc_sums.arcstat_overhead_size);
7320         as->arcstat_hdr_size.value.ui64 =
7321             wmsum_value(&arc_sums.arcstat_hdr_size);
7322         as->arcstat_data_size.value.ui64 =
7323             wmsum_value(&arc_sums.arcstat_data_size);
7324         as->arcstat_metadata_size.value.ui64 =
7325             wmsum_value(&arc_sums.arcstat_metadata_size);
7326         as->arcstat_dbuf_size.value.ui64 =
7327             wmsum_value(&arc_sums.arcstat_dbuf_size);
7328 #if defined(COMPAT_FREEBSD11)
7329         as->arcstat_other_size.value.ui64 =
7330             wmsum_value(&arc_sums.arcstat_bonus_size) +
7331             aggsum_value(&arc_sums.arcstat_dnode_size) +
7332             wmsum_value(&arc_sums.arcstat_dbuf_size);
7333 #endif
7334
7335         arc_kstat_update_state(arc_anon,
7336             &as->arcstat_anon_size,
7337             &as->arcstat_anon_evictable_data,
7338             &as->arcstat_anon_evictable_metadata);
7339         arc_kstat_update_state(arc_mru,
7340             &as->arcstat_mru_size,
7341             &as->arcstat_mru_evictable_data,
7342             &as->arcstat_mru_evictable_metadata);
7343         arc_kstat_update_state(arc_mru_ghost,
7344             &as->arcstat_mru_ghost_size,
7345             &as->arcstat_mru_ghost_evictable_data,
7346             &as->arcstat_mru_ghost_evictable_metadata);
7347         arc_kstat_update_state(arc_mfu,
7348             &as->arcstat_mfu_size,
7349             &as->arcstat_mfu_evictable_data,
7350             &as->arcstat_mfu_evictable_metadata);
7351         arc_kstat_update_state(arc_mfu_ghost,
7352             &as->arcstat_mfu_ghost_size,
7353             &as->arcstat_mfu_ghost_evictable_data,
7354             &as->arcstat_mfu_ghost_evictable_metadata);
7355
7356         as->arcstat_dnode_size.value.ui64 =
7357             aggsum_value(&arc_sums.arcstat_dnode_size);
7358         as->arcstat_bonus_size.value.ui64 =
7359             wmsum_value(&arc_sums.arcstat_bonus_size);
7360         as->arcstat_l2_hits.value.ui64 =
7361             wmsum_value(&arc_sums.arcstat_l2_hits);
7362         as->arcstat_l2_misses.value.ui64 =
7363             wmsum_value(&arc_sums.arcstat_l2_misses);
7364         as->arcstat_l2_prefetch_asize.value.ui64 =
7365             wmsum_value(&arc_sums.arcstat_l2_prefetch_asize);
7366         as->arcstat_l2_mru_asize.value.ui64 =
7367             wmsum_value(&arc_sums.arcstat_l2_mru_asize);
7368         as->arcstat_l2_mfu_asize.value.ui64 =
7369             wmsum_value(&arc_sums.arcstat_l2_mfu_asize);
7370         as->arcstat_l2_bufc_data_asize.value.ui64 =
7371             wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize);
7372         as->arcstat_l2_bufc_metadata_asize.value.ui64 =
7373             wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize);
7374         as->arcstat_l2_feeds.value.ui64 =
7375             wmsum_value(&arc_sums.arcstat_l2_feeds);
7376         as->arcstat_l2_rw_clash.value.ui64 =
7377             wmsum_value(&arc_sums.arcstat_l2_rw_clash);
7378         as->arcstat_l2_read_bytes.value.ui64 =
7379             wmsum_value(&arc_sums.arcstat_l2_read_bytes);
7380         as->arcstat_l2_write_bytes.value.ui64 =
7381             wmsum_value(&arc_sums.arcstat_l2_write_bytes);
7382         as->arcstat_l2_writes_sent.value.ui64 =
7383             wmsum_value(&arc_sums.arcstat_l2_writes_sent);
7384         as->arcstat_l2_writes_done.value.ui64 =
7385             wmsum_value(&arc_sums.arcstat_l2_writes_done);
7386         as->arcstat_l2_writes_error.value.ui64 =
7387             wmsum_value(&arc_sums.arcstat_l2_writes_error);
7388         as->arcstat_l2_writes_lock_retry.value.ui64 =
7389             wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry);
7390         as->arcstat_l2_evict_lock_retry.value.ui64 =
7391             wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry);
7392         as->arcstat_l2_evict_reading.value.ui64 =
7393             wmsum_value(&arc_sums.arcstat_l2_evict_reading);
7394         as->arcstat_l2_evict_l1cached.value.ui64 =
7395             wmsum_value(&arc_sums.arcstat_l2_evict_l1cached);
7396         as->arcstat_l2_free_on_write.value.ui64 =
7397             wmsum_value(&arc_sums.arcstat_l2_free_on_write);
7398         as->arcstat_l2_abort_lowmem.value.ui64 =
7399             wmsum_value(&arc_sums.arcstat_l2_abort_lowmem);
7400         as->arcstat_l2_cksum_bad.value.ui64 =
7401             wmsum_value(&arc_sums.arcstat_l2_cksum_bad);
7402         as->arcstat_l2_io_error.value.ui64 =
7403             wmsum_value(&arc_sums.arcstat_l2_io_error);
7404         as->arcstat_l2_lsize.value.ui64 =
7405             wmsum_value(&arc_sums.arcstat_l2_lsize);
7406         as->arcstat_l2_psize.value.ui64 =
7407             wmsum_value(&arc_sums.arcstat_l2_psize);
7408         as->arcstat_l2_hdr_size.value.ui64 =
7409             aggsum_value(&arc_sums.arcstat_l2_hdr_size);
7410         as->arcstat_l2_log_blk_writes.value.ui64 =
7411             wmsum_value(&arc_sums.arcstat_l2_log_blk_writes);
7412         as->arcstat_l2_log_blk_asize.value.ui64 =
7413             wmsum_value(&arc_sums.arcstat_l2_log_blk_asize);
7414         as->arcstat_l2_log_blk_count.value.ui64 =
7415             wmsum_value(&arc_sums.arcstat_l2_log_blk_count);
7416         as->arcstat_l2_rebuild_success.value.ui64 =
7417             wmsum_value(&arc_sums.arcstat_l2_rebuild_success);
7418         as->arcstat_l2_rebuild_abort_unsupported.value.ui64 =
7419             wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
7420         as->arcstat_l2_rebuild_abort_io_errors.value.ui64 =
7421             wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
7422         as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 =
7423             wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
7424         as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 =
7425             wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
7426         as->arcstat_l2_rebuild_abort_lowmem.value.ui64 =
7427             wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
7428         as->arcstat_l2_rebuild_size.value.ui64 =
7429             wmsum_value(&arc_sums.arcstat_l2_rebuild_size);
7430         as->arcstat_l2_rebuild_asize.value.ui64 =
7431             wmsum_value(&arc_sums.arcstat_l2_rebuild_asize);
7432         as->arcstat_l2_rebuild_bufs.value.ui64 =
7433             wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs);
7434         as->arcstat_l2_rebuild_bufs_precached.value.ui64 =
7435             wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached);
7436         as->arcstat_l2_rebuild_log_blks.value.ui64 =
7437             wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks);
7438         as->arcstat_memory_throttle_count.value.ui64 =
7439             wmsum_value(&arc_sums.arcstat_memory_throttle_count);
7440         as->arcstat_memory_direct_count.value.ui64 =
7441             wmsum_value(&arc_sums.arcstat_memory_direct_count);
7442         as->arcstat_memory_indirect_count.value.ui64 =
7443             wmsum_value(&arc_sums.arcstat_memory_indirect_count);
7444
7445         as->arcstat_memory_all_bytes.value.ui64 =
7446             arc_all_memory();
7447         as->arcstat_memory_free_bytes.value.ui64 =
7448             arc_free_memory();
7449         as->arcstat_memory_available_bytes.value.i64 =
7450             arc_available_memory();
7451
7452         as->arcstat_prune.value.ui64 =
7453             wmsum_value(&arc_sums.arcstat_prune);
7454         as->arcstat_meta_used.value.ui64 =
7455             aggsum_value(&arc_sums.arcstat_meta_used);
7456         as->arcstat_async_upgrade_sync.value.ui64 =
7457             wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
7458         as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
7459             wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
7460         as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
7461             wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
7462         as->arcstat_raw_size.value.ui64 =
7463             wmsum_value(&arc_sums.arcstat_raw_size);
7464         as->arcstat_cached_only_in_progress.value.ui64 =
7465             wmsum_value(&arc_sums.arcstat_cached_only_in_progress);
7466         as->arcstat_abd_chunk_waste_size.value.ui64 =
7467             wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size);
7468
7469         return (0);
7470 }
7471
7472 /*
7473  * This function *must* return indices evenly distributed between all
7474  * sublists of the multilist. This is needed due to how the ARC eviction
7475  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
7476  * distributed between all sublists and uses this assumption when
7477  * deciding which sublist to evict from and how much to evict from it.
7478  */
7479 static unsigned int
7480 arc_state_multilist_index_func(multilist_t *ml, void *obj)
7481 {
7482         arc_buf_hdr_t *hdr = obj;
7483
7484         /*
7485          * We rely on b_dva to generate evenly distributed index
7486          * numbers using buf_hash below. So, as an added precaution,
7487          * let's make sure we never add empty buffers to the arc lists.
7488          */
7489         ASSERT(!HDR_EMPTY(hdr));
7490
7491         /*
7492          * The assumption here, is the hash value for a given
7493          * arc_buf_hdr_t will remain constant throughout its lifetime
7494          * (i.e. its b_spa, b_dva, and b_birth fields don't change).
7495          * Thus, we don't need to store the header's sublist index
7496          * on insertion, as this index can be recalculated on removal.
7497          *
7498          * Also, the low order bits of the hash value are thought to be
7499          * distributed evenly. Otherwise, in the case that the multilist
7500          * has a power of two number of sublists, each sublists' usage
7501          * would not be evenly distributed. In this context full 64bit
7502          * division would be a waste of time, so limit it to 32 bits.
7503          */
7504         return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7505             multilist_get_num_sublists(ml));
7506 }
7507
/*
 * Sublist index function installed on the arc_l2c_only multilists (see
 * arc_state_init()).  It never produces an index: any call panics, reporting
 * the object and the multilist involved, because nothing is expected to be
 * inserted into those lists.
 */
static unsigned int
arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
{
        /* No return statement follows: control ends in the panic. */
        panic("Header %p insert into arc_l2c_only %p", obj, ml);
}
7513
/*
 * Log a CE_WARN message when a tunable was supplied but is not the value in
 * effect.
 *
 * tuning  - the tunable variable; its spelling is also used as the name
 *           printed in the message, so pass the variable itself.
 * value   - the setting actually in use.
 * do_warn - the message is emitted only when this is non-zero.
 *
 * Nothing is logged when 'tuning' is zero or equal to 'value'.  Callers pass
 * values of differing integer types, so 'value' is cast to u_longlong_t to
 * match the "%llu" conversion; handing a differently typed variadic argument
 * to that conversion is undefined behaviour.
 */
#define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {     \
        if ((do_warn) && (tuning) && ((tuning) != (value))) {   \
                cmn_err(CE_WARN,                                \
                    "ignoring tunable %s (using %llu instead)", \
                    (#tuning), (u_longlong_t)(value));          \
        }                                                       \
} while (0)
7521
7522 /*
7523  * Called during module initialization and periodically thereafter to
7524  * apply reasonable changes to the exposed performance tunings.  Can also be
7525  * called explicitly by param_set_arc_*() functions when ARC tunables are
7526  * updated manually.  Non-zero zfs_* values which differ from the currently set
7527  * values will be applied.
7528  */
7529 void
7530 arc_tuning_update(boolean_t verbose)
7531 {
7532         uint64_t allmem = arc_all_memory();
7533         unsigned long limit;
7534
7535         /* Valid range: 32M - <arc_c_max> */
7536         if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
7537             (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
7538             (zfs_arc_min <= arc_c_max)) {
7539                 arc_c_min = zfs_arc_min;
7540                 arc_c = MAX(arc_c, arc_c_min);
7541         }
7542         WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
7543
7544         /* Valid range: 64M - <all physical memory> */
7545         if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
7546             (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
7547             (zfs_arc_max > arc_c_min)) {
7548                 arc_c_max = zfs_arc_max;
7549                 arc_c = MIN(arc_c, arc_c_max);
7550                 arc_p = (arc_c >> 1);
7551                 if (arc_meta_limit > arc_c_max)
7552                         arc_meta_limit = arc_c_max;
7553                 if (arc_dnode_size_limit > arc_meta_limit)
7554                         arc_dnode_size_limit = arc_meta_limit;
7555         }
7556         WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
7557
7558         /* Valid range: 16M - <arc_c_max> */
7559         if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
7560             (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
7561             (zfs_arc_meta_min <= arc_c_max)) {
7562                 arc_meta_min = zfs_arc_meta_min;
7563                 if (arc_meta_limit < arc_meta_min)
7564                         arc_meta_limit = arc_meta_min;
7565                 if (arc_dnode_size_limit < arc_meta_min)
7566                         arc_dnode_size_limit = arc_meta_min;
7567         }
7568         WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
7569
7570         /* Valid range: <arc_meta_min> - <arc_c_max> */
7571         limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
7572             MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
7573         if ((limit != arc_meta_limit) &&
7574             (limit >= arc_meta_min) &&
7575             (limit <= arc_c_max))
7576                 arc_meta_limit = limit;
7577         WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
7578
7579         /* Valid range: <arc_meta_min> - <arc_meta_limit> */
7580         limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
7581             MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
7582         if ((limit != arc_dnode_size_limit) &&
7583             (limit >= arc_meta_min) &&
7584             (limit <= arc_meta_limit))
7585                 arc_dnode_size_limit = limit;
7586         WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
7587             verbose);
7588
7589         /* Valid range: 1 - N */
7590         if (zfs_arc_grow_retry)
7591                 arc_grow_retry = zfs_arc_grow_retry;
7592
7593         /* Valid range: 1 - N */
7594         if (zfs_arc_shrink_shift) {
7595                 arc_shrink_shift = zfs_arc_shrink_shift;
7596                 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
7597         }
7598
7599         /* Valid range: 1 - N */
7600         if (zfs_arc_p_min_shift)
7601                 arc_p_min_shift = zfs_arc_p_min_shift;
7602
7603         /* Valid range: 1 - N ms */
7604         if (zfs_arc_min_prefetch_ms)
7605                 arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
7606
7607         /* Valid range: 1 - N ms */
7608         if (zfs_arc_min_prescient_prefetch_ms) {
7609                 arc_min_prescient_prefetch_ms =
7610                     zfs_arc_min_prescient_prefetch_ms;
7611         }
7612
7613         /* Valid range: 0 - 100 */
7614         if ((zfs_arc_lotsfree_percent >= 0) &&
7615             (zfs_arc_lotsfree_percent <= 100))
7616                 arc_lotsfree_percent = zfs_arc_lotsfree_percent;
7617         WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
7618             verbose);
7619
7620         /* Valid range: 0 - <all physical memory> */
7621         if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
7622                 arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
7623         WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
7624 }
7625
7626 static void
7627 arc_state_multilist_init(multilist_t *ml,
7628     multilist_sublist_index_func_t *index_func, int *maxcountp)
7629 {
7630         multilist_create(ml, sizeof (arc_buf_hdr_t),
7631             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
7632         *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
7633 }
7634
7635 static void
7636 arc_state_init(void)
7637 {
7638         int num_sublists = 0;
7639
7640         arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
7641             arc_state_multilist_index_func, &num_sublists);
7642         arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
7643             arc_state_multilist_index_func, &num_sublists);
7644         arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
7645             arc_state_multilist_index_func, &num_sublists);
7646         arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
7647             arc_state_multilist_index_func, &num_sublists);
7648         arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
7649             arc_state_multilist_index_func, &num_sublists);
7650         arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
7651             arc_state_multilist_index_func, &num_sublists);
7652         arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
7653             arc_state_multilist_index_func, &num_sublists);
7654         arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
7655             arc_state_multilist_index_func, &num_sublists);
7656
7657         /*
7658          * L2 headers should never be on the L2 state list since they don't
7659          * have L1 headers allocated.  Special index function asserts that.
7660          */
7661         arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
7662             arc_state_l2c_multilist_index_func, &num_sublists);
7663         arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
7664             arc_state_l2c_multilist_index_func, &num_sublists);
7665
7666         /*
7667          * Keep track of the number of markers needed to reclaim buffers from
7668          * any ARC state.  The markers will be pre-allocated so as to minimize
7669          * the number of memory allocations performed by the eviction thread.
7670          */
7671         arc_state_evict_marker_count = num_sublists;
7672
7673         zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7674         zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7675         zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7676         zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7677         zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7678         zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7679         zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7680         zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7681         zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7682         zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7683         zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7684         zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7685
7686         zfs_refcount_create(&arc_anon->arcs_size);
7687         zfs_refcount_create(&arc_mru->arcs_size);
7688         zfs_refcount_create(&arc_mru_ghost->arcs_size);
7689         zfs_refcount_create(&arc_mfu->arcs_size);
7690         zfs_refcount_create(&arc_mfu_ghost->arcs_size);
7691         zfs_refcount_create(&arc_l2c_only->arcs_size);
7692
7693         wmsum_init(&arc_sums.arcstat_hits, 0);
7694         wmsum_init(&arc_sums.arcstat_misses, 0);
7695         wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
7696         wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
7697         wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
7698         wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
7699         wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
7700         wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
7701         wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
7702         wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
7703         wmsum_init(&arc_sums.arcstat_mru_hits, 0);
7704         wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
7705         wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
7706         wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
7707         wmsum_init(&arc_sums.arcstat_deleted, 0);
7708         wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
7709         wmsum_init(&arc_sums.arcstat_access_skip, 0);
7710         wmsum_init(&arc_sums.arcstat_evict_skip, 0);
7711         wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
7712         wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
7713         wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
7714         wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
7715         wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
7716         wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
7717         wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
7718         wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
7719         wmsum_init(&arc_sums.arcstat_hash_chains, 0);
7720         aggsum_init(&arc_sums.arcstat_size, 0);
7721         wmsum_init(&arc_sums.arcstat_compressed_size, 0);
7722         wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
7723         wmsum_init(&arc_sums.arcstat_overhead_size, 0);
7724         wmsum_init(&arc_sums.arcstat_hdr_size, 0);
7725         wmsum_init(&arc_sums.arcstat_data_size, 0);
7726         wmsum_init(&arc_sums.arcstat_metadata_size, 0);
7727         wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
7728         aggsum_init(&arc_sums.arcstat_dnode_size, 0);
7729         wmsum_init(&arc_sums.arcstat_bonus_size, 0);
7730         wmsum_init(&arc_sums.arcstat_l2_hits, 0);
7731         wmsum_init(&arc_sums.arcstat_l2_misses, 0);
7732         wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
7733         wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
7734         wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
7735         wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
7736         wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
7737         wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
7738         wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
7739         wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
7740         wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
7741         wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
7742         wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
7743         wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
7744         wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
7745         wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
7746         wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
7747         wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
7748         wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
7749         wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
7750         wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
7751         wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
7752         wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
7753         wmsum_init(&arc_sums.arcstat_l2_psize, 0);
7754         aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
7755         wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
7756         wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
7757         wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
7758         wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
7759         wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
7760         wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
7761         wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
7762         wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
7763         wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
7764         wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
7765         wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
7766         wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
7767         wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
7768         wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
7769         wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
7770         wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
7771         wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
7772         wmsum_init(&arc_sums.arcstat_prune, 0);
7773         aggsum_init(&arc_sums.arcstat_meta_used, 0);
7774         wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
7775         wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
7776         wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
7777         wmsum_init(&arc_sums.arcstat_raw_size, 0);
7778         wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
7779         wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
7780
7781         arc_anon->arcs_state = ARC_STATE_ANON;
7782         arc_mru->arcs_state = ARC_STATE_MRU;
7783         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
7784         arc_mfu->arcs_state = ARC_STATE_MFU;
7785         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
7786         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
7787 }
7788
7789 static void
7790 arc_state_fini(void)
7791 {
7792         zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7793         zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7794         zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7795         zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7796         zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7797         zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7798         zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7799         zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7800         zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7801         zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7802         zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7803         zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7804
7805         zfs_refcount_destroy(&arc_anon->arcs_size);
7806         zfs_refcount_destroy(&arc_mru->arcs_size);
7807         zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
7808         zfs_refcount_destroy(&arc_mfu->arcs_size);
7809         zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
7810         zfs_refcount_destroy(&arc_l2c_only->arcs_size);
7811
7812         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
7813         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7814         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7815         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7816         multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
7817         multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7818         multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
7819         multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7820         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7821         multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
7822
7823         wmsum_fini(&arc_sums.arcstat_hits);
7824         wmsum_fini(&arc_sums.arcstat_misses);
7825         wmsum_fini(&arc_sums.arcstat_demand_data_hits);
7826         wmsum_fini(&arc_sums.arcstat_demand_data_misses);
7827         wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
7828         wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
7829         wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
7830         wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
7831         wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
7832         wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
7833         wmsum_fini(&arc_sums.arcstat_mru_hits);
7834         wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
7835         wmsum_fini(&arc_sums.arcstat_mfu_hits);
7836         wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
7837         wmsum_fini(&arc_sums.arcstat_deleted);
7838         wmsum_fini(&arc_sums.arcstat_mutex_miss);
7839         wmsum_fini(&arc_sums.arcstat_access_skip);
7840         wmsum_fini(&arc_sums.arcstat_evict_skip);
7841         wmsum_fini(&arc_sums.arcstat_evict_not_enough);
7842         wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
7843         wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
7844         wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
7845         wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
7846         wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
7847         wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
7848         wmsum_fini(&arc_sums.arcstat_hash_collisions);
7849         wmsum_fini(&arc_sums.arcstat_hash_chains);
7850         aggsum_fini(&arc_sums.arcstat_size);
7851         wmsum_fini(&arc_sums.arcstat_compressed_size);
7852         wmsum_fini(&arc_sums.arcstat_uncompressed_size);
7853         wmsum_fini(&arc_sums.arcstat_overhead_size);
7854         wmsum_fini(&arc_sums.arcstat_hdr_size);
7855         wmsum_fini(&arc_sums.arcstat_data_size);
7856         wmsum_fini(&arc_sums.arcstat_metadata_size);
7857         wmsum_fini(&arc_sums.arcstat_dbuf_size);
7858         aggsum_fini(&arc_sums.arcstat_dnode_size);
7859         wmsum_fini(&arc_sums.arcstat_bonus_size);
7860         wmsum_fini(&arc_sums.arcstat_l2_hits);
7861         wmsum_fini(&arc_sums.arcstat_l2_misses);
7862         wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
7863         wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
7864         wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
7865         wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
7866         wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
7867         wmsum_fini(&arc_sums.arcstat_l2_feeds);
7868         wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
7869         wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
7870         wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
7871         wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
7872         wmsum_fini(&arc_sums.arcstat_l2_writes_done);
7873         wmsum_fini(&arc_sums.arcstat_l2_writes_error);
7874         wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
7875         wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
7876         wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
7877         wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
7878         wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
7879         wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
7880         wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
7881         wmsum_fini(&arc_sums.arcstat_l2_io_error);
7882         wmsum_fini(&arc_sums.arcstat_l2_lsize);
7883         wmsum_fini(&arc_sums.arcstat_l2_psize);
7884         aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
7885         wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
7886         wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
7887         wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
7888         wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
7889         wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
7890         wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
7891         wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
7892         wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
7893         wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
7894         wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
7895         wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
7896         wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
7897         wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
7898         wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
7899         wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
7900         wmsum_fini(&arc_sums.arcstat_memory_direct_count);
7901         wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
7902         wmsum_fini(&arc_sums.arcstat_prune);
7903         aggsum_fini(&arc_sums.arcstat_meta_used);
7904         wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
7905         wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
7906         wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
7907         wmsum_fini(&arc_sums.arcstat_raw_size);
7908         wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
7909         wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
7910 }
7911
7912 uint64_t
7913 arc_target_bytes(void)
7914 {
7915         return (arc_c);
7916 }
7917
7918 void
7919 arc_set_limits(uint64_t allmem)
7920 {
7921         /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
7922         arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
7923
7924         /* How to set default max varies by platform. */
7925         arc_c_max = arc_default_max(arc_c_min, allmem);
7926 }
7927 void
7928 arc_init(void)
7929 {
7930         uint64_t percent, allmem = arc_all_memory();
7931         mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
7932         list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
7933             offsetof(arc_evict_waiter_t, aew_node));
7934
7935         arc_min_prefetch_ms = 1000;
7936         arc_min_prescient_prefetch_ms = 6000;
7937
7938 #if defined(_KERNEL)
7939         arc_lowmem_init();
7940 #endif
7941
7942         arc_set_limits(allmem);
7943
7944 #ifdef _KERNEL
7945         /*
7946          * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
7947          * environment before the module was loaded, don't block setting the
7948          * maximum because it is less than arc_c_min, instead, reset arc_c_min
7949          * to a lower value.
7950          * zfs_arc_min will be handled by arc_tuning_update().
7951          */
7952         if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
7953             zfs_arc_max < allmem) {
7954                 arc_c_max = zfs_arc_max;
7955                 if (arc_c_min >= arc_c_max) {
7956                         arc_c_min = MAX(zfs_arc_max / 2,
7957                             2ULL << SPA_MAXBLOCKSHIFT);
7958                 }
7959         }
7960 #else
7961         /*
7962          * In userland, there's only the memory pressure that we artificially
7963          * create (see arc_available_memory()).  Don't let arc_c get too
7964          * small, because it can cause transactions to be larger than
7965          * arc_c, causing arc_tempreserve_space() to fail.
7966          */
7967         arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
7968 #endif
7969
7970         arc_c = arc_c_min;
7971         arc_p = (arc_c >> 1);
7972
7973         /* Set min to 1/2 of arc_c_min */
7974         arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
7975         /*
7976          * Set arc_meta_limit to a percent of arc_c_max with a floor of
7977          * arc_meta_min, and a ceiling of arc_c_max.
7978          */
7979         percent = MIN(zfs_arc_meta_limit_percent, 100);
7980         arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
7981         percent = MIN(zfs_arc_dnode_limit_percent, 100);
7982         arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
7983
7984         /* Apply user specified tunings */
7985         arc_tuning_update(B_TRUE);
7986
7987         /* if kmem_flags are set, lets try to use less memory */
7988         if (kmem_debugging())
7989                 arc_c = arc_c / 2;
7990         if (arc_c < arc_c_min)
7991                 arc_c = arc_c_min;
7992
7993         arc_register_hotplug();
7994
7995         arc_state_init();
7996
7997         buf_init();
7998
7999         list_create(&arc_prune_list, sizeof (arc_prune_t),
8000             offsetof(arc_prune_t, p_node));
8001         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
8002
8003         arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
8004             boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
8005             TASKQ_THREADS_CPU_PCT);
8006
8007         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
8008             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
8009
8010         if (arc_ksp != NULL) {
8011                 arc_ksp->ks_data = &arc_stats;
8012                 arc_ksp->ks_update = arc_kstat_update;
8013                 kstat_install(arc_ksp);
8014         }
8015
8016         arc_state_evict_markers =
8017             arc_state_alloc_markers(arc_state_evict_marker_count);
8018         arc_evict_zthr = zthr_create("arc_evict",
8019             arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri);
8020         arc_reap_zthr = zthr_create_timer("arc_reap",
8021             arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
8022
8023         arc_warm = B_FALSE;
8024
8025         /*
8026          * Calculate maximum amount of dirty data per pool.
8027          *
8028          * If it has been set by a module parameter, take that.
8029          * Otherwise, use a percentage of physical memory defined by
8030          * zfs_dirty_data_max_percent (default 10%) with a cap at
8031          * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
8032          */
8033 #ifdef __LP64__
8034         if (zfs_dirty_data_max_max == 0)
8035                 zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
8036                     allmem * zfs_dirty_data_max_max_percent / 100);
8037 #else
8038         if (zfs_dirty_data_max_max == 0)
8039                 zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
8040                     allmem * zfs_dirty_data_max_max_percent / 100);
8041 #endif
8042
8043         if (zfs_dirty_data_max == 0) {
8044                 zfs_dirty_data_max = allmem *
8045                     zfs_dirty_data_max_percent / 100;
8046                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
8047                     zfs_dirty_data_max_max);
8048         }
8049 }
8050
8051 void
8052 arc_fini(void)
8053 {
8054         arc_prune_t *p;
8055
8056 #ifdef _KERNEL
8057         arc_lowmem_fini();
8058 #endif /* _KERNEL */
8059
8060         /* Use B_TRUE to ensure *all* buffers are evicted */
8061         arc_flush(NULL, B_TRUE);
8062
8063         if (arc_ksp != NULL) {
8064                 kstat_delete(arc_ksp);
8065                 arc_ksp = NULL;
8066         }
8067
8068         taskq_wait(arc_prune_taskq);
8069         taskq_destroy(arc_prune_taskq);
8070
8071         mutex_enter(&arc_prune_mtx);
8072         while ((p = list_head(&arc_prune_list)) != NULL) {
8073                 list_remove(&arc_prune_list, p);
8074                 zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
8075                 zfs_refcount_destroy(&p->p_refcnt);
8076                 kmem_free(p, sizeof (*p));
8077         }
8078         mutex_exit(&arc_prune_mtx);
8079
8080         list_destroy(&arc_prune_list);
8081         mutex_destroy(&arc_prune_mtx);
8082
8083         (void) zthr_cancel(arc_evict_zthr);
8084         (void) zthr_cancel(arc_reap_zthr);
8085         arc_state_free_markers(arc_state_evict_markers,
8086             arc_state_evict_marker_count);
8087
8088         mutex_destroy(&arc_evict_lock);
8089         list_destroy(&arc_evict_waiters);
8090
8091         /*
8092          * Free any buffers that were tagged for destruction.  This needs
8093          * to occur before arc_state_fini() runs and destroys the aggsum
8094          * values which are updated when freeing scatter ABDs.
8095          */
8096         l2arc_do_free_on_write();
8097
8098         /*
8099          * buf_fini() must proceed arc_state_fini() because buf_fin() may
8100          * trigger the release of kmem magazines, which can callback to
8101          * arc_space_return() which accesses aggsums freed in act_state_fini().
8102          */
8103         buf_fini();
8104         arc_state_fini();
8105
8106         arc_unregister_hotplug();
8107
8108         /*
8109          * We destroy the zthrs after all the ARC state has been
8110          * torn down to avoid the case of them receiving any
8111          * wakeup() signals after they are destroyed.
8112          */
8113         zthr_destroy(arc_evict_zthr);
8114         zthr_destroy(arc_reap_zthr);
8115
8116         ASSERT0(arc_loaned_bytes);
8117 }
8118
8119 /*
8120  * Level 2 ARC
8121  *
8122  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
8123  * It uses dedicated storage devices to hold cached data, which are populated
8124  * using large infrequent writes.  The main role of this cache is to boost
8125  * the performance of random read workloads.  The intended L2ARC devices
8126  * include short-stroked disks, solid state disks, and other media with
8127  * substantially faster read latency than disk.
8128  *
8129  *                 +-----------------------+
8130  *                 |         ARC           |
8131  *                 +-----------------------+
8132  *                    |         ^     ^
8133  *                    |         |     |
8134  *      l2arc_feed_thread()    arc_read()
8135  *                    |         |     |
8136  *                    |  l2arc read   |
8137  *                    V         |     |
8138  *               +---------------+    |
8139  *               |     L2ARC     |    |
8140  *               +---------------+    |
8141  *                   |    ^           |
8142  *          l2arc_write() |           |
8143  *                   |    |           |
8144  *                   V    |           |
8145  *                 +-------+      +-------+
8146  *                 | vdev  |      | vdev  |
8147  *                 | cache |      | cache |
8148  *                 +-------+      +-------+
8149  *                 +=========+     .-----.
8150  *                 :  L2ARC  :    |-_____-|
8151  *                 : devices :    | Disks |
8152  *                 +=========+    `-_____-'
8153  *
8154  * Read requests are satisfied from the following sources, in order:
8155  *
8156  *      1) ARC
8157  *      2) vdev cache of L2ARC devices
8158  *      3) L2ARC devices
8159  *      4) vdev cache of disks
8160  *      5) disks
8161  *
8162  * Some L2ARC device types exhibit extremely slow write performance.
8163  * To accommodate this, there are some significant differences between
8164  * the L2ARC and traditional cache design:
8165  *
8166  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
8167  * the ARC behave as usual, freeing buffers and placing headers on ghost
8168  * lists.  The ARC does not send buffers to the L2ARC during eviction as
8169  * this would add inflated write latencies for all ARC memory pressure.
8170  *
8171  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
8172  * It does this by periodically scanning buffers from the eviction-end of
8173  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
8174  * not already there. It scans until a headroom of buffers is satisfied,
8175  * which itself is a buffer for ARC eviction. If a compressible buffer is
8176  * found during scanning and selected for writing to an L2ARC device, we
8177  * temporarily boost scanning headroom during the next scan cycle to make
8178  * sure we adapt to compression effects (which might significantly reduce
8179  * the data volume we write to L2ARC). The thread that does this is
8180  * l2arc_feed_thread(), illustrated below; example sizes are included to
8181  * provide a better sense of ratio than this diagram:
8182  *
8183  *             head -->                        tail
8184  *              +---------------------+----------+
8185  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
8186  *              +---------------------+----------+   |   o L2ARC eligible
8187  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
8188  *              +---------------------+----------+   |
8189  *                   15.9 Gbytes      ^ 32 Mbytes    |
8190  *                                 headroom          |
8191  *                                            l2arc_feed_thread()
8192  *                                                   |
8193  *                       l2arc write hand <--[oooo]--'
8194  *                               |           8 Mbyte
8195  *                               |          write max
8196  *                               V
8197  *                +==============================+
8198  *      L2ARC dev |####|#|###|###|    |####| ... |
8199  *                +==============================+
8200  *                           32 Gbytes
8201  *
8202  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
8203  * evicted, then the L2ARC has cached a buffer much sooner than it probably
8204  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
8205  * safe to say that this is an uncommon case, since buffers at the end of
8206  * the ARC lists have moved there due to inactivity.
8207  *
8208  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
8209  * then the L2ARC simply misses copying some buffers.  This serves as a
8210  * pressure valve to prevent heavy read workloads from both stalling the ARC
8211  * with waits and clogging the L2ARC with writes.  This also helps prevent
8212  * the potential for the L2ARC to churn if it attempts to cache content too
8213  * quickly, such as during backups of the entire pool.
8214  *
8215  * 5. After system boot and before the ARC has filled main memory, there are
8216  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
8217  * lists can remain mostly static.  Instead of searching from tail of these
8218  * lists as pictured, the l2arc_feed_thread() will search from the list heads
8219  * for eligible buffers, greatly increasing its chance of finding them.
8220  *
8221  * The L2ARC device write speed is also boosted during this time so that
8222  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
8223  * there are no L2ARC reads, and no fear of degrading read performance
8224  * through increased writes.
8225  *
8226  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
8227  * the vdev queue can aggregate them into larger and fewer writes.  Each
8228  * device is written to in a rotor fashion, sweeping writes through
8229  * available space then repeating.
8230  *
8231  * 7. The L2ARC does not store dirty content.  It never needs to flush
8232  * write buffers back to disk based storage.
8233  *
8234  * 8. If an ARC buffer is written (and dirtied) which also exists in the
8235  * L2ARC, the now stale L2ARC buffer is immediately dropped.
8236  *
8237  * The performance of the L2ARC can be tweaked by a number of tunables, which
8238  * may be necessary for different workloads:
8239  *
8240  *      l2arc_write_max         max write bytes per interval
8241  *      l2arc_write_boost       extra write bytes during device warmup
8242  *      l2arc_noprefetch        skip caching prefetched buffers
8243  *      l2arc_headroom          number of max device writes to precache
8244  *      l2arc_headroom_boost    when we find compressed buffers during ARC
8245  *                              scanning, we multiply headroom by this
8246  *                              percentage factor for the next scan cycle,
8247  *                              since more compressed buffers are likely to
8248  *                              be present
8249  *      l2arc_feed_secs         seconds between L2ARC writing
8250  *
8251  * Tunables may be removed or added as future performance improvements are
8252  * integrated, and also may become zpool properties.
8253  *
8254  * There are three key functions that control how the L2ARC warms up:
8255  *
8256  *      l2arc_write_eligible()  check if a buffer is eligible to cache
8257  *      l2arc_write_size()      calculate how much to write
8258  *      l2arc_write_interval()  calculate sleep delay between writes
8259  *
8260  * These three functions determine what to write, how much, and how quickly
8261  * to send writes.
8262  *
8263  * L2ARC persistence:
8264  *
8265  * When writing buffers to L2ARC, we periodically add some metadata to
8266  * make sure we can pick them up after reboot, thus dramatically reducing
8267  * the impact that any downtime has on the performance of storage systems
8268  * with large caches.
8269  *
8270  * The implementation works fairly simply by integrating the following two
8271  * modifications:
8272  *
8273  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
8274  *    which is an additional piece of metadata which describes what's been
8275  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
8276  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
8277  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
8278  *    time-wise and offset-wise interleaved, but that is an optimization rather
8279  *    than for correctness. The log block also includes a pointer to the
8280  *    previous block in its chain.
8281  *
8282  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
8283  *    for our header bookkeeping purposes. This contains a device header,
8284  *    which contains our top-level reference structures. We update it each
8285  *    time we write a new log block, so that we're able to locate it in the
8286  *    L2ARC device. If this write results in an inconsistent device header
8287  *    (e.g. due to power failure), we detect this by verifying the header's
8288  *    checksum and simply fail to reconstruct the L2ARC after reboot.
8289  *
8290  * Implementation diagram:
8291  *
8292  * +=== L2ARC device (not to scale) ======================================+
8293  * |       ___two newest log block pointers__.__________                  |
8294  * |      /                                   \dh_start_lbps[1]           |
8295  * |     /                                     \         \dh_start_lbps[0]|
8296  * |.___/__.                                    V         V               |
8297  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
8298  * ||   hdr|      ^         /^       /^        /         /                |
8299  * |+------+  ...--\-------/  \-----/--\------/         /                 |
8300  * |                \--------------/    \--------------/                  |
8301  * +======================================================================+
8302  *
8303  * As can be seen on the diagram, rather than using a simple linked list,
8304  * we use a pair of linked lists with alternating elements. This is a
8305  * performance enhancement due to the fact that we only find out the
8306  * address of the next log block once the current block has been
8307  * completely read in. With a single list this would hurt performance,
8308  * because we'd keep the device's I/O queue only 1 operation deep, thus
8309  * incurring a large amount of I/O round-trip latency. Having two lists
8310  * allows us to fetch two log blocks ahead of where we are currently
8311  * rebuilding L2ARC buffers.
8312  *
8313  * On-device data structures:
8314  *
8315  * L2ARC device header: l2arc_dev_hdr_phys_t
8316  * L2ARC log block:     l2arc_log_blk_phys_t
8317  *
8318  * L2ARC reconstruction:
8319  *
8320  * When writing data, we simply write in the standard rotary fashion,
8321  * evicting buffers as we go and simply writing new data over them (writing
8322  * a new log block every now and then). This obviously means that once we
8323  * loop around the end of the device, we will start cutting into an already
8324  * committed log block (and its referenced data buffers), like so:
8325  *
8326  *    current write head__       __old tail
8327  *                        \     /
8328  *                        V    V
8329  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
8330  *                         ^    ^^^^^^^^^___________________________________
8331  *                         |                                                \
8332  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
8333  *
8334  * When importing the pool, we detect this situation and use it to stop
8335  * our scanning process (see l2arc_rebuild).
8336  *
8337  * There is one significant caveat to consider when rebuilding ARC contents
8338  * from an L2ARC device: what about invalidated buffers? Given the above
8339  * construction, we cannot update blocks which we've already written to amend
8340  * them to remove buffers which were invalidated. Thus, during reconstruction,
8341  * we might be populating the cache with buffers for data that's not on the
8342  * main pool anymore, or may have been overwritten!
8343  *
8344  * As it turns out, this isn't a problem. Every arc_read request includes
8345  * both the DVA and, crucially, the birth TXG of the BP the caller is
8346  * looking for. So even if the cache were populated by completely rotten
8347  * blocks for data that had been long deleted and/or overwritten, we'll
8348  * never actually return bad data from the cache, since the DVA with the
8349  * birth TXG uniquely identify a block in space and time - once created,
8350  * a block is immutable on disk. The worst thing we have done is wasted
8351  * some time and memory at l2arc rebuild to reconstruct outdated ARC
8352  * entries that will get dropped from the l2arc as it is being updated
8353  * with new blocks.
8354  *
8355  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
8356  * hand are not restored. This is done by saving the offset (in bytes)
8357  * l2arc_evict() has evicted to in the L2ARC device header and taking it
8358  * into account when restoring buffers.
8359  */
8360
8361 static boolean_t
8362 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
8363 {
8364         /*
8365          * A buffer is *not* eligible for the L2ARC if it:
8366          * 1. belongs to a different spa.
8367          * 2. is already cached on the L2ARC.
8368          * 3. has an I/O in progress (it may be an incomplete read).
8369          * 4. is flagged not eligible (zfs property).
8370          */
8371         if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
8372             HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
8373                 return (B_FALSE);
8374
8375         return (B_TRUE);
8376 }
8377
8378 static uint64_t
8379 l2arc_write_size(l2arc_dev_t *dev)
8380 {
8381         uint64_t size, dev_size, tsize;
8382
8383         /*
8384          * Make sure our globals have meaningful values in case the user
8385          * altered them.
8386          */
8387         size = l2arc_write_max;
8388         if (size == 0) {
8389                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
8390                     "be greater than zero, resetting it to the default (%d)",
8391                     L2ARC_WRITE_SIZE);
8392                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
8393         }
8394
8395         if (arc_warm == B_FALSE)
8396                 size += l2arc_write_boost;
8397
8398         /*
8399          * Make sure the write size does not exceed the size of the cache
8400          * device. This is important in l2arc_evict(), otherwise infinite
8401          * iteration can occur.
8402          */
8403         dev_size = dev->l2ad_end - dev->l2ad_start;
8404         tsize = size + l2arc_log_blk_overhead(size, dev);
8405         if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
8406                 tsize += MAX(64 * 1024 * 1024,
8407                     (tsize * l2arc_trim_ahead) / 100);
8408
8409         if (tsize >= dev_size) {
8410                 cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
8411                     "plus the overhead of log blocks (persistent L2ARC, "
8412                     "%llu bytes) exceeds the size of the cache device "
8413                     "(guid %llu), resetting them to the default (%d)",
8414                     l2arc_log_blk_overhead(size, dev),
8415                     dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
8416                 size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
8417
8418                 if (arc_warm == B_FALSE)
8419                         size += l2arc_write_boost;
8420         }
8421
8422         return (size);
8423
8424 }
8425
8426 static clock_t
8427 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
8428 {
8429         clock_t interval, next, now;
8430
8431         /*
8432          * If the ARC lists are busy, increase our write rate; if the
8433          * lists are stale, idle back.  This is achieved by checking
8434          * how much we previously wrote - if it was more than half of
8435          * what we wanted, schedule the next write much sooner.
8436          */
8437         if (l2arc_feed_again && wrote > (wanted / 2))
8438                 interval = (hz * l2arc_feed_min_ms) / 1000;
8439         else
8440                 interval = hz * l2arc_feed_secs;
8441
8442         now = ddi_get_lbolt();
8443         next = MAX(now, MIN(now + interval, began + interval));
8444
8445         return (next);
8446 }
8447
8448 /*
8449  * Cycle through L2ARC devices.  This is how L2ARC load balances.
8450  * If a device is returned, this also returns holding the spa config lock.
8451  */
8452 static l2arc_dev_t *
8453 l2arc_dev_get_next(void)
8454 {
8455         l2arc_dev_t *first, *next = NULL;
8456
8457         /*
8458          * Lock out the removal of spas (spa_namespace_lock), then removal
8459          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
8460          * both locks will be dropped and a spa config lock held instead.
8461          */
8462         mutex_enter(&spa_namespace_lock);
8463         mutex_enter(&l2arc_dev_mtx);
8464
8465         /* if there are no vdevs, there is nothing to do */
8466         if (l2arc_ndev == 0)
8467                 goto out;
8468
8469         first = NULL;
8470         next = l2arc_dev_last;
8471         do {
8472                 /* loop around the list looking for a non-faulted vdev */
8473                 if (next == NULL) {
8474                         next = list_head(l2arc_dev_list);
8475                 } else {
8476                         next = list_next(l2arc_dev_list, next);
8477                         if (next == NULL)
8478                                 next = list_head(l2arc_dev_list);
8479                 }
8480
8481                 /* if we have come back to the start, bail out */
8482                 if (first == NULL)
8483                         first = next;
8484                 else if (next == first)
8485                         break;
8486
8487         } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
8488             next->l2ad_trim_all);
8489
8490         /* if we were unable to find any usable vdevs, return NULL */
8491         if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
8492             next->l2ad_trim_all)
8493                 next = NULL;
8494
8495         l2arc_dev_last = next;
8496
8497 out:
8498         mutex_exit(&l2arc_dev_mtx);
8499
8500         /*
8501          * Grab the config lock to prevent the 'next' device from being
8502          * removed while we are writing to it.
8503          */
8504         if (next != NULL)
8505                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
8506         mutex_exit(&spa_namespace_lock);
8507
8508         return (next);
8509 }
8510
8511 /*
8512  * Free buffers that were tagged for destruction.
8513  */
8514 static void
8515 l2arc_do_free_on_write(void)
8516 {
8517         list_t *buflist;
8518         l2arc_data_free_t *df, *df_prev;
8519
8520         mutex_enter(&l2arc_free_on_write_mtx);
8521         buflist = l2arc_free_on_write;
8522
8523         for (df = list_tail(buflist); df; df = df_prev) {
8524                 df_prev = list_prev(buflist, df);
8525                 ASSERT3P(df->l2df_abd, !=, NULL);
8526                 abd_free(df->l2df_abd);
8527                 list_remove(buflist, df);
8528                 kmem_free(df, sizeof (l2arc_data_free_t));
8529         }
8530
8531         mutex_exit(&l2arc_free_on_write_mtx);
8532 }
8533
8534 /*
8535  * A write to a cache device has completed.  Update all headers to allow
8536  * reads from these buffers to begin.
8537  */
8538 static void
8539 l2arc_write_done(zio_t *zio)
8540 {
8541         l2arc_write_callback_t  *cb;
8542         l2arc_lb_abd_buf_t      *abd_buf;
8543         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
8544         l2arc_dev_t             *dev;
8545         l2arc_dev_hdr_phys_t    *l2dhdr;
8546         list_t                  *buflist;
8547         arc_buf_hdr_t           *head, *hdr, *hdr_prev;
8548         kmutex_t                *hash_lock;
8549         int64_t                 bytes_dropped = 0;
8550
8551         cb = zio->io_private;
8552         ASSERT3P(cb, !=, NULL);
8553         dev = cb->l2wcb_dev;
8554         l2dhdr = dev->l2ad_dev_hdr;
8555         ASSERT3P(dev, !=, NULL);
8556         head = cb->l2wcb_head;
8557         ASSERT3P(head, !=, NULL);
8558         buflist = &dev->l2ad_buflist;
8559         ASSERT3P(buflist, !=, NULL);
8560         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
8561             l2arc_write_callback_t *, cb);
8562
8563         /*
8564          * All writes completed, or an error was hit.
8565          */
8566 top:
8567         mutex_enter(&dev->l2ad_mtx);
8568         for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
8569                 hdr_prev = list_prev(buflist, hdr);
8570
8571                 hash_lock = HDR_LOCK(hdr);
8572
8573                 /*
8574                  * We cannot use mutex_enter or else we can deadlock
8575                  * with l2arc_write_buffers (due to swapping the order
8576                  * the hash lock and l2ad_mtx are taken).
8577                  */
8578                 if (!mutex_tryenter(hash_lock)) {
8579                         /*
8580                          * Missed the hash lock. We must retry so we
8581                          * don't leave the ARC_FLAG_L2_WRITING bit set.
8582                          */
8583                         ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
8584
8585                         /*
8586                          * We don't want to rescan the headers we've
8587                          * already marked as having been written out, so
8588                          * we reinsert the head node so we can pick up
8589                          * where we left off.
8590                          */
8591                         list_remove(buflist, head);
8592                         list_insert_after(buflist, hdr, head);
8593
8594                         mutex_exit(&dev->l2ad_mtx);
8595
8596                         /*
8597                          * We wait for the hash lock to become available
8598                          * to try and prevent busy waiting, and increase
8599                          * the chance we'll be able to acquire the lock
8600                          * the next time around.
8601                          */
8602                         mutex_enter(hash_lock);
8603                         mutex_exit(hash_lock);
8604                         goto top;
8605                 }
8606
8607                 /*
8608                  * We could not have been moved into the arc_l2c_only
8609                  * state while in-flight due to our ARC_FLAG_L2_WRITING
8610                  * bit being set. Let's just ensure that's being enforced.
8611                  */
8612                 ASSERT(HDR_HAS_L1HDR(hdr));
8613
8614                 /*
8615                  * Skipped - drop L2ARC entry and mark the header as no
8616                  * longer L2 eligibile.
8617                  */
8618                 if (zio->io_error != 0) {
8619                         /*
8620                          * Error - drop L2ARC entry.
8621                          */
8622                         list_remove(buflist, hdr);
8623                         arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
8624
8625                         uint64_t psize = HDR_GET_PSIZE(hdr);
8626                         l2arc_hdr_arcstats_decrement(hdr);
8627
8628                         bytes_dropped +=
8629                             vdev_psize_to_asize(dev->l2ad_vdev, psize);
8630                         (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
8631                             arc_hdr_size(hdr), hdr);
8632                 }
8633
8634                 /*
8635                  * Allow ARC to begin reads and ghost list evictions to
8636                  * this L2ARC entry.
8637                  */
8638                 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
8639
8640                 mutex_exit(hash_lock);
8641         }
8642
8643         /*
8644          * Free the allocated abd buffers for writing the log blocks.
8645          * If the zio failed reclaim the allocated space and remove the
8646          * pointers to these log blocks from the log block pointer list
8647          * of the L2ARC device.
8648          */
8649         while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
8650                 abd_free(abd_buf->abd);
8651                 zio_buf_free(abd_buf, sizeof (*abd_buf));
8652                 if (zio->io_error != 0) {
8653                         lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
8654                         /*
8655                          * L2BLK_GET_PSIZE returns aligned size for log
8656                          * blocks.
8657                          */
8658                         uint64_t asize =
8659                             L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
8660                         bytes_dropped += asize;
8661                         ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
8662                         ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
8663                         zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
8664                             lb_ptr_buf);
8665                         zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
8666                         kmem_free(lb_ptr_buf->lb_ptr,
8667                             sizeof (l2arc_log_blkptr_t));
8668                         kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
8669                 }
8670         }
8671         list_destroy(&cb->l2wcb_abd_list);
8672
8673         if (zio->io_error != 0) {
8674                 ARCSTAT_BUMP(arcstat_l2_writes_error);
8675
8676                 /*
8677                  * Restore the lbps array in the header to its previous state.
8678                  * If the list of log block pointers is empty, zero out the
8679                  * log block pointers in the device header.
8680                  */
8681                 lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
8682                 for (int i = 0; i < 2; i++) {
8683                         if (lb_ptr_buf == NULL) {
8684                                 /*
8685                                  * If the list is empty zero out the device
8686                                  * header. Otherwise zero out the second log
8687                                  * block pointer in the header.
8688                                  */
8689                                 if (i == 0) {
8690                                         bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
8691                                 } else {
8692                                         bzero(&l2dhdr->dh_start_lbps[i],
8693                                             sizeof (l2arc_log_blkptr_t));
8694                                 }
8695                                 break;
8696                         }
8697                         bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
8698                             sizeof (l2arc_log_blkptr_t));
8699                         lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
8700                             lb_ptr_buf);
8701                 }
8702         }
8703
8704         ARCSTAT_BUMP(arcstat_l2_writes_done);
8705         list_remove(buflist, head);
8706         ASSERT(!HDR_HAS_L1HDR(head));
8707         kmem_cache_free(hdr_l2only_cache, head);
8708         mutex_exit(&dev->l2ad_mtx);
8709
8710         ASSERT(dev->l2ad_vdev != NULL);
8711         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
8712
8713         l2arc_do_free_on_write();
8714
8715         kmem_free(cb, sizeof (l2arc_write_callback_t));
8716 }
8717
8718 static int
8719 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
8720 {
8721         int ret;
8722         spa_t *spa = zio->io_spa;
8723         arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
8724         blkptr_t *bp = zio->io_bp;
8725         uint8_t salt[ZIO_DATA_SALT_LEN];
8726         uint8_t iv[ZIO_DATA_IV_LEN];
8727         uint8_t mac[ZIO_DATA_MAC_LEN];
8728         boolean_t no_crypt = B_FALSE;
8729
8730         /*
8731          * ZIL data is never be written to the L2ARC, so we don't need
8732          * special handling for its unique MAC storage.
8733          */
8734         ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
8735         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
8736         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8737
8738         /*
8739          * If the data was encrypted, decrypt it now. Note that
8740          * we must check the bp here and not the hdr, since the
8741          * hdr does not have its encryption parameters updated
8742          * until arc_read_done().
8743          */
8744         if (BP_IS_ENCRYPTED(bp)) {
8745                 abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
8746                     ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
8747
8748                 zio_crypt_decode_params_bp(bp, salt, iv);
8749                 zio_crypt_decode_mac_bp(bp, mac);
8750
8751                 ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
8752                     BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
8753                     salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
8754                     hdr->b_l1hdr.b_pabd, &no_crypt);
8755                 if (ret != 0) {
8756                         arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
8757                         goto error;
8758                 }
8759
8760                 /*
8761                  * If we actually performed decryption, replace b_pabd
8762                  * with the decrypted data. Otherwise we can just throw
8763                  * our decryption buffer away.
8764                  */
8765                 if (!no_crypt) {
8766                         arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8767                             arc_hdr_size(hdr), hdr);
8768                         hdr->b_l1hdr.b_pabd = eabd;
8769                         zio->io_abd = eabd;
8770                 } else {
8771                         arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
8772                 }
8773         }
8774
8775         /*
8776          * If the L2ARC block was compressed, but ARC compression
8777          * is disabled we decompress the data into a new buffer and
8778          * replace the existing data.
8779          */
8780         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8781             !HDR_COMPRESSION_ENABLED(hdr)) {
8782                 abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
8783                     ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
8784                 void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
8785
8786                 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
8787                     hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
8788                     HDR_GET_LSIZE(hdr), &hdr->b_complevel);
8789                 if (ret != 0) {
8790                         abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
8791                         arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
8792                         goto error;
8793                 }
8794
8795                 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
8796                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
8797                     arc_hdr_size(hdr), hdr);
8798                 hdr->b_l1hdr.b_pabd = cabd;
8799                 zio->io_abd = cabd;
8800                 zio->io_size = HDR_GET_LSIZE(hdr);
8801         }
8802
8803         return (0);
8804
8805 error:
8806         return (ret);
8807 }
8808
8809
/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 *
 * zio->io_private is the l2arc_read_callback_t set up for this read; it is
 * freed before returning.  The SCL_L2ARC config lock tagged with
 * zio->io_vd is released here.
 *
 * If the checksum matches, the data untransforms cleanly, the zio has no
 * error and the header was not L2-evicted meanwhile, the zio is passed on
 * to arc_read_done().  Otherwise the zio is marked failed and, when nobody
 * is waiting on it, a replacement read of the block is issued through
 * zio_read() with arc_read_done() as its completion callback.
 */
static void
l2arc_read_done(zio_t *zio)
{
        int tfm_error = 0;
        l2arc_read_callback_t *cb = zio->io_private;
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        boolean_t valid_cksum;
        /*
         * True when the block is encrypted and the raw (still encrypted)
         * data was requested; b_rabd is then used instead of b_pabd.
         */
        boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
            (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));

        ASSERT3P(zio->io_vd, !=, NULL);
        ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

        /* The device I/O is over; drop the config lock tagged with it. */
        spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

        ASSERT3P(cb, !=, NULL);
        hdr = cb->l2rcb_hdr;
        ASSERT3P(hdr, !=, NULL);

        /* The header is examined and modified below under its hash lock. */
        hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

        /*
         * If the data was read into a temporary buffer,
         * move it and free the buffer.
         */
        if (cb->l2rcb_abd != NULL) {
                /* The temporary buffer is larger than the header's data. */
                ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
                if (zio->io_error == 0) {
                        if (using_rdata) {
                                abd_copy(hdr->b_crypt_hdr.b_rabd,
                                    cb->l2rcb_abd, arc_hdr_size(hdr));
                        } else {
                                abd_copy(hdr->b_l1hdr.b_pabd,
                                    cb->l2rcb_abd, arc_hdr_size(hdr));
                        }
                }

                /*
                 * The following must be done regardless of whether
                 * there was an error:
                 * - free the temporary buffer
                 * - point zio to the real ARC buffer
                 * - set zio size accordingly
                 * These are required because zio is either re-used for
                 * an I/O of the block in the case of the error
                 * or the zio is passed to arc_read_done() and it
                 * needs real data.
                 */
                abd_free(cb->l2rcb_abd);
                zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);

                if (using_rdata) {
                        ASSERT(HDR_HAS_RABD(hdr));
                        zio->io_abd = zio->io_orig_abd =
                            hdr->b_crypt_hdr.b_rabd;
                } else {
                        ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
                        zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
                }
        }

        ASSERT3P(zio->io_abd, !=, NULL);

        /*
         * Check this survived the L2ARC journey.
         */
        ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
            (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
        /*
         * Give the zio the block pointer saved in the callback, stored in
         * the zio's own io_bp_copy, before the checksum is verified.
         */
        zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
        zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
        zio->io_prop.zp_complevel = hdr->b_complevel;

        valid_cksum = arc_cksum_is_equal(hdr, zio);

        /*
         * b_rabd will always match the data as it exists on disk if it is
         * being used. Therefore if we are reading into b_rabd we do not
         * attempt to untransform the data.
         */
        if (valid_cksum && !using_rdata)
                tfm_error = l2arc_untransform(zio, cb);

        if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
            !HDR_L2_EVICTED(hdr)) {
                /*
                 * Success: release the hash lock, then hand the zio to
                 * arc_read_done() with the header as its private data.
                 */
                mutex_exit(hash_lock);
                zio->io_private = hdr;
                arc_read_done(zio);
        } else {
                /*
                 * Buffer didn't survive caching.  Increment stats and
                 * reissue to the original storage device.
                 */
                if (zio->io_error != 0) {
                        ARCSTAT_BUMP(arcstat_l2_io_error);
                } else {
                        /* No I/O error was reported, so flag one here. */
                        zio->io_error = SET_ERROR(EIO);
                }
                /* A failed untransform is counted as a bad checksum too. */
                if (!valid_cksum || tfm_error != 0)
                        ARCSTAT_BUMP(arcstat_l2_cksum_bad);

                /*
                 * If there's no waiter, issue an async i/o to the primary
                 * storage now.  If there *is* a waiter, the caller must
                 * issue the i/o in a context where it's OK to block.
                 */
                if (zio->io_waiter == NULL) {
                        zio_t *pio = zio_unique_parent(zio);
                        /* Read into whichever buffer this read targeted. */
                        void *abd = (using_rdata) ?
                            hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;

                        ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

                        zio = zio_read(pio, zio->io_spa, zio->io_bp,
                            abd, zio->io_size, arc_read_done,
                            hdr, zio->io_priority, cb->l2rcb_flags,
                            &cb->l2rcb_zb);

                        /*
                         * Original ZIO will be freed, so we need to update
                         * ARC header with the new ZIO pointer to be used
                         * by zio_change_priority() in arc_read().
                         */
                        for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
                            acb != NULL; acb = acb->acb_next)
                                acb->acb_zio_head = zio;

                        /* Release the hash lock before issuing the new zio. */
                        mutex_exit(hash_lock);
                        zio_nowait(zio);
                } else {
                        mutex_exit(hash_lock);
                }
        }

        kmem_free(cb, sizeof (l2arc_read_callback_t));
}
8952
8953 /*
8954  * This is the list priority from which the L2ARC will search for pages to
8955  * cache.  This is used within loops (0..3) to cycle through lists in the
8956  * desired order.  This order can have a significant effect on cache
8957  * performance.
8958  *
8959  * Currently the metadata lists are hit first, MFU then MRU, followed by
8960  * the data lists.  This function returns a locked list, and also returns
8961  * the lock pointer.
8962  */
8963 static multilist_sublist_t *
8964 l2arc_sublist_lock(int list_num)
8965 {
8966         multilist_t *ml = NULL;
8967         unsigned int idx;
8968
8969         ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
8970
8971         switch (list_num) {
8972         case 0:
8973                 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
8974                 break;
8975         case 1:
8976                 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
8977                 break;
8978         case 2:
8979                 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
8980                 break;
8981         case 3:
8982                 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
8983                 break;
8984         default:
8985                 return (NULL);
8986         }
8987
8988         /*
8989          * Return a randomly-selected sublist. This is acceptable
8990          * because the caller feeds only a little bit of data for each
8991          * call (8MB). Subsequent calls will result in different
8992          * sublists being selected.
8993          */
8994         idx = multilist_get_random_index(ml);
8995         return (multilist_sublist_lock(ml, idx));
8996 }
8997
8998 /*
8999  * Calculates the maximum overhead of L2ARC metadata log blocks for a given
9000  * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
9001  * overhead in processing to make sure there is enough headroom available
9002  * when writing buffers.
9003  */
9004 static inline uint64_t
9005 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
9006 {
9007         if (dev->l2ad_log_entries == 0) {
9008                 return (0);
9009         } else {
9010                 uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
9011
9012                 uint64_t log_blocks = (log_entries +
9013                     dev->l2ad_log_entries - 1) /
9014                     dev->l2ad_log_entries;
9015
9016                 return (vdev_psize_to_asize(dev->l2ad_vdev,
9017                     sizeof (l2arc_log_blk_phys_t)) * log_blocks);
9018         }
9019 }
9020
9021 /*
9022  * Evict buffers from the device write hand to the distance specified in
9023  * bytes. This distance may span populated buffers, it may span nothing.
9024  * This is clearing a region on the L2ARC device ready for writing.
9025  * If the 'all' boolean is set, every buffer is evicted.
9026  */
9027 static void
9028 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
9029 {
9030         list_t *buflist;
9031         arc_buf_hdr_t *hdr, *hdr_prev;
9032         kmutex_t *hash_lock;
9033         uint64_t taddr;
9034         l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
9035         vdev_t *vd = dev->l2ad_vdev;
9036         boolean_t rerun;
9037
9038         buflist = &dev->l2ad_buflist;
9039
9040         /*
9041          * We need to add in the worst case scenario of log block overhead.
9042          */
9043         distance += l2arc_log_blk_overhead(distance, dev);
9044         if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
9045                 /*
9046                  * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
9047                  * times the write size, whichever is greater.
9048                  */
9049                 distance += MAX(64 * 1024 * 1024,
9050                     (distance * l2arc_trim_ahead) / 100);
9051         }
9052
9053 top:
9054         rerun = B_FALSE;
9055         if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
9056                 /*
9057                  * When there is no space to accommodate upcoming writes,
9058                  * evict to the end. Then bump the write and evict hands
9059                  * to the start and iterate. This iteration does not
9060                  * happen indefinitely as we make sure in
9061                  * l2arc_write_size() that when the write hand is reset,
9062                  * the write size does not exceed the end of the device.
9063                  */
9064                 rerun = B_TRUE;
9065                 taddr = dev->l2ad_end;
9066         } else {
9067                 taddr = dev->l2ad_hand + distance;
9068         }
9069         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
9070             uint64_t, taddr, boolean_t, all);
9071
9072         if (!all) {
9073                 /*
9074                  * This check has to be placed after deciding whether to
9075                  * iterate (rerun).
9076                  */
9077                 if (dev->l2ad_first) {
9078                         /*
9079                          * This is the first sweep through the device. There is
9080                          * nothing to evict. We have already trimmmed the
9081                          * whole device.
9082                          */
9083                         goto out;
9084                 } else {
9085                         /*
9086                          * Trim the space to be evicted.
9087                          */
9088                         if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
9089                             l2arc_trim_ahead > 0) {
9090                                 /*
9091                                  * We have to drop the spa_config lock because
9092                                  * vdev_trim_range() will acquire it.
9093                                  * l2ad_evict already accounts for the label
9094                                  * size. To prevent vdev_trim_ranges() from
9095                                  * adding it again, we subtract it from
9096                                  * l2ad_evict.
9097                                  */
9098                                 spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
9099                                 vdev_trim_simple(vd,
9100                                     dev->l2ad_evict - VDEV_LABEL_START_SIZE,
9101                                     taddr - dev->l2ad_evict);
9102                                 spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
9103                                     RW_READER);
9104                         }
9105
9106                         /*
9107                          * When rebuilding L2ARC we retrieve the evict hand
9108                          * from the header of the device. Of note, l2arc_evict()
9109                          * does not actually delete buffers from the cache
9110                          * device, but trimming may do so depending on the
9111                          * hardware implementation. Thus keeping track of the
9112                          * evict hand is useful.
9113                          */
9114                         dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
9115                 }
9116         }
9117
9118 retry:
9119         mutex_enter(&dev->l2ad_mtx);
9120         /*
9121          * We have to account for evicted log blocks. Run vdev_space_update()
9122          * on log blocks whose offset (in bytes) is before the evicted offset
9123          * (in bytes) by searching in the list of pointers to log blocks
9124          * present in the L2ARC device.
9125          */
9126         for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
9127             lb_ptr_buf = lb_ptr_buf_prev) {
9128
9129                 lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
9130
9131                 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
9132                 uint64_t asize = L2BLK_GET_PSIZE(
9133                     (lb_ptr_buf->lb_ptr)->lbp_prop);
9134
9135                 /*
9136                  * We don't worry about log blocks left behind (ie
9137                  * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
9138                  * will never write more than l2arc_evict() evicts.
9139                  */
9140                 if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
9141                         break;
9142                 } else {
9143                         vdev_space_update(vd, -asize, 0, 0);
9144                         ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
9145                         ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
9146                         zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
9147                             lb_ptr_buf);
9148                         zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
9149                         list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
9150                         kmem_free(lb_ptr_buf->lb_ptr,
9151                             sizeof (l2arc_log_blkptr_t));
9152                         kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
9153                 }
9154         }
9155
9156         for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
9157                 hdr_prev = list_prev(buflist, hdr);
9158
9159                 ASSERT(!HDR_EMPTY(hdr));
9160                 hash_lock = HDR_LOCK(hdr);
9161
9162                 /*
9163                  * We cannot use mutex_enter or else we can deadlock
9164                  * with l2arc_write_buffers (due to swapping the order
9165                  * the hash lock and l2ad_mtx are taken).
9166                  */
9167                 if (!mutex_tryenter(hash_lock)) {
9168                         /*
9169                          * Missed the hash lock.  Retry.
9170                          */
9171                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
9172                         mutex_exit(&dev->l2ad_mtx);
9173                         mutex_enter(hash_lock);
9174                         mutex_exit(hash_lock);
9175                         goto retry;
9176                 }
9177
9178                 /*
9179                  * A header can't be on this list if it doesn't have L2 header.
9180                  */
9181                 ASSERT(HDR_HAS_L2HDR(hdr));
9182
9183                 /* Ensure this header has finished being written. */
9184                 ASSERT(!HDR_L2_WRITING(hdr));
9185                 ASSERT(!HDR_L2_WRITE_HEAD(hdr));
9186
9187                 if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
9188                     hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
9189                         /*
9190                          * We've evicted to the target address,
9191                          * or the end of the device.
9192                          */
9193                         mutex_exit(hash_lock);
9194                         break;
9195                 }
9196
9197                 if (!HDR_HAS_L1HDR(hdr)) {
9198                         ASSERT(!HDR_L2_READING(hdr));
9199                         /*
9200                          * This doesn't exist in the ARC.  Destroy.
9201                          * arc_hdr_destroy() will call list_remove()
9202                          * and decrement arcstat_l2_lsize.
9203                          */
9204                         arc_change_state(arc_anon, hdr, hash_lock);
9205                         arc_hdr_destroy(hdr);
9206                 } else {
9207                         ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
9208                         ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
9209                         /*
9210                          * Invalidate issued or about to be issued
9211                          * reads, since we may be about to write
9212                          * over this location.
9213                          */
9214                         if (HDR_L2_READING(hdr)) {
9215                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
9216                                 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
9217                         }
9218
9219                         arc_hdr_l2hdr_destroy(hdr);
9220                 }
9221                 mutex_exit(hash_lock);
9222         }
9223         mutex_exit(&dev->l2ad_mtx);
9224
9225 out:
9226         /*
9227          * We need to check if we evict all buffers, otherwise we may iterate
9228          * unnecessarily.
9229          */
9230         if (!all && rerun) {
9231                 /*
9232                  * Bump device hand to the device start if it is approaching the
9233                  * end. l2arc_evict() has already evicted ahead for this case.
9234                  */
9235                 dev->l2ad_hand = dev->l2ad_start;
9236                 dev->l2ad_evict = dev->l2ad_start;
9237                 dev->l2ad_first = B_FALSE;
9238                 goto top;
9239         }
9240
9241         if (!all) {
9242                 /*
9243                  * In case of cache device removal (all) the following
9244                  * assertions may be violated without functional consequences
9245                  * as the device is about to be removed.
9246                  */
9247                 ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
9248                 if (!dev->l2ad_first)
9249                         ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
9250         }
9251 }
9252
9253 /*
9254  * Handle any abd transforms that might be required for writing to the L2ARC.
9255  * If successful, this function will always return an abd with the data
9256  * transformed as it is on disk in a new abd of asize bytes.
9257  */
9258 static int
9259 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
9260     abd_t **abd_out)
9261 {
9262         int ret;
9263         void *tmp = NULL;
9264         abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
9265         enum zio_compress compress = HDR_GET_COMPRESS(hdr);
9266         uint64_t psize = HDR_GET_PSIZE(hdr);
9267         uint64_t size = arc_hdr_size(hdr);
9268         boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
9269         boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
9270         dsl_crypto_key_t *dck = NULL;
9271         uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
9272         boolean_t no_crypt = B_FALSE;
9273
9274         ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
9275             !HDR_COMPRESSION_ENABLED(hdr)) ||
9276             HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
9277         ASSERT3U(psize, <=, asize);
9278
9279         /*
9280          * If this data simply needs its own buffer, we simply allocate it
9281          * and copy the data. This may be done to eliminate a dependency on a
9282          * shared buffer or to reallocate the buffer to match asize.
9283          */
9284         if (HDR_HAS_RABD(hdr) && asize != psize) {
9285                 ASSERT3U(asize, >=, psize);
9286                 to_write = abd_alloc_for_io(asize, ismd);
9287                 abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
9288                 if (psize != asize)
9289                         abd_zero_off(to_write, psize, asize - psize);
9290                 goto out;
9291         }
9292
9293         if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
9294             !HDR_ENCRYPTED(hdr)) {
9295                 ASSERT3U(size, ==, psize);
9296                 to_write = abd_alloc_for_io(asize, ismd);
9297                 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
9298                 if (size != asize)
9299                         abd_zero_off(to_write, size, asize - size);
9300                 goto out;
9301         }
9302
9303         if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
9304                 cabd = abd_alloc_for_io(asize, ismd);
9305                 tmp = abd_borrow_buf(cabd, asize);
9306
9307                 psize = zio_compress_data(compress, to_write, tmp, size,
9308                     hdr->b_complevel);
9309
9310                 if (psize >= size) {
9311                         abd_return_buf(cabd, tmp, asize);
9312                         HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
9313                         to_write = cabd;
9314                         abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
9315                         if (size != asize)
9316                                 abd_zero_off(to_write, size, asize - size);
9317                         goto encrypt;
9318                 }
9319                 ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
9320                 if (psize < asize)
9321                         bzero((char *)tmp + psize, asize - psize);
9322                 psize = HDR_GET_PSIZE(hdr);
9323                 abd_return_buf_copy(cabd, tmp, asize);
9324                 to_write = cabd;
9325         }
9326
9327 encrypt:
9328         if (HDR_ENCRYPTED(hdr)) {
9329                 eabd = abd_alloc_for_io(asize, ismd);
9330
9331                 /*
9332                  * If the dataset was disowned before the buffer
9333                  * made it to this point, the key to re-encrypt
9334                  * it won't be available. In this case we simply
9335                  * won't write the buffer to the L2ARC.
9336                  */
9337                 ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
9338                     FTAG, &dck);
9339                 if (ret != 0)
9340                         goto error;
9341
9342                 ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
9343                     hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
9344                     hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
9345                     &no_crypt);
9346                 if (ret != 0)
9347                         goto error;
9348
9349                 if (no_crypt)
9350                         abd_copy(eabd, to_write, psize);
9351
9352                 if (psize != asize)
9353                         abd_zero_off(eabd, psize, asize - psize);
9354
9355                 /* assert that the MAC we got here matches the one we saved */
9356                 ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
9357                 spa_keystore_dsl_key_rele(spa, dck, FTAG);
9358
9359                 if (to_write == cabd)
9360                         abd_free(cabd);
9361
9362                 to_write = eabd;
9363         }
9364
9365 out:
9366         ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
9367         *abd_out = to_write;
9368         return (0);
9369
9370 error:
9371         if (dck != NULL)
9372                 spa_keystore_dsl_key_rele(spa, dck, FTAG);
9373         if (cabd != NULL)
9374                 abd_free(cabd);
9375         if (eabd != NULL)
9376                 abd_free(eabd);
9377
9378         *abd_out = NULL;
9379         return (ret);
9380 }
9381
9382 static void
9383 l2arc_blk_fetch_done(zio_t *zio)
9384 {
9385         l2arc_read_callback_t *cb;
9386
9387         cb = zio->io_private;
9388         if (cb->l2rcb_abd != NULL)
9389                 abd_free(cb->l2rcb_abd);
9390         kmem_free(cb, sizeof (l2arc_read_callback_t));
9391 }
9392
9393 /*
9394  * Find and write ARC buffers to the L2ARC device.
9395  *
9396  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
9397  * for reading until they have completed writing.
9398  * The headroom_boost is an in-out parameter used to maintain headroom boost
9399  * state between calls to this function.
9400  *
9401  * Returns the number of bytes actually written (which may be smaller than
9402  * the delta by which the device hand has changed due to alignment and the
9403  * writing of log blocks).
9404  */
9405 static uint64_t
9406 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
9407 {
9408         arc_buf_hdr_t           *hdr, *hdr_prev, *head;
9409         uint64_t                write_asize, write_psize, write_lsize, headroom;
9410         boolean_t               full;
9411         l2arc_write_callback_t  *cb = NULL;
9412         zio_t                   *pio, *wzio;
9413         uint64_t                guid = spa_load_guid(spa);
9414         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
9415
9416         ASSERT3P(dev->l2ad_vdev, !=, NULL);
9417
9418         pio = NULL;
9419         write_lsize = write_asize = write_psize = 0;
9420         full = B_FALSE;
9421         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
9422         arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
9423
9424         /*
9425          * Copy buffers for L2ARC writing.
9426          */
9427         for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
9428                 /*
9429                  * If pass == 1 or 3, we cache MRU metadata and data
9430                  * respectively.
9431                  */
9432                 if (l2arc_mfuonly) {
9433                         if (pass == 1 || pass == 3)
9434                                 continue;
9435                 }
9436
9437                 multilist_sublist_t *mls = l2arc_sublist_lock(pass);
9438                 uint64_t passed_sz = 0;
9439
9440                 VERIFY3P(mls, !=, NULL);
9441
9442                 /*
9443                  * L2ARC fast warmup.
9444                  *
9445                  * Until the ARC is warm and starts to evict, read from the
9446                  * head of the ARC lists rather than the tail.
9447                  */
9448                 if (arc_warm == B_FALSE)
9449                         hdr = multilist_sublist_head(mls);
9450                 else
9451                         hdr = multilist_sublist_tail(mls);
9452
9453                 headroom = target_sz * l2arc_headroom;
9454                 if (zfs_compressed_arc_enabled)
9455                         headroom = (headroom * l2arc_headroom_boost) / 100;
9456
9457                 for (; hdr; hdr = hdr_prev) {
9458                         kmutex_t *hash_lock;
9459                         abd_t *to_write = NULL;
9460
9461                         if (arc_warm == B_FALSE)
9462                                 hdr_prev = multilist_sublist_next(mls, hdr);
9463                         else
9464                                 hdr_prev = multilist_sublist_prev(mls, hdr);
9465
9466                         hash_lock = HDR_LOCK(hdr);
9467                         if (!mutex_tryenter(hash_lock)) {
9468                                 /*
9469                                  * Skip this buffer rather than waiting.
9470                                  */
9471                                 continue;
9472                         }
9473
9474                         passed_sz += HDR_GET_LSIZE(hdr);
9475                         if (l2arc_headroom != 0 && passed_sz > headroom) {
9476                                 /*
9477                                  * Searched too far.
9478                                  */
9479                                 mutex_exit(hash_lock);
9480                                 break;
9481                         }
9482
9483                         if (!l2arc_write_eligible(guid, hdr)) {
9484                                 mutex_exit(hash_lock);
9485                                 continue;
9486                         }
9487
9488                         /*
9489                          * We rely on the L1 portion of the header below, so
9490                          * it's invalid for this header to have been evicted out
9491                          * of the ghost cache, prior to being written out. The
9492                          * ARC_FLAG_L2_WRITING bit ensures this won't happen.
9493                          */
9494                         ASSERT(HDR_HAS_L1HDR(hdr));
9495
9496                         ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
9497                         ASSERT3U(arc_hdr_size(hdr), >, 0);
9498                         ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
9499                             HDR_HAS_RABD(hdr));
9500                         uint64_t psize = HDR_GET_PSIZE(hdr);
9501                         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
9502                             psize);
9503
9504                         if ((write_asize + asize) > target_sz) {
9505                                 full = B_TRUE;
9506                                 mutex_exit(hash_lock);
9507                                 break;
9508                         }
9509
9510                         /*
9511                          * We rely on the L1 portion of the header below, so
9512                          * it's invalid for this header to have been evicted out
9513                          * of the ghost cache, prior to being written out. The
9514                          * ARC_FLAG_L2_WRITING bit ensures this won't happen.
9515                          */
9516                         arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
9517                         ASSERT(HDR_HAS_L1HDR(hdr));
9518
9519                         ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
9520                         ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
9521                             HDR_HAS_RABD(hdr));
9522                         ASSERT3U(arc_hdr_size(hdr), >, 0);
9523
9524                         /*
9525                          * If this header has b_rabd, we can use this since it
9526                          * must always match the data exactly as it exists on
9527                          * disk. Otherwise, the L2ARC can normally use the
9528                          * hdr's data, but if we're sharing data between the
9529                          * hdr and one of its bufs, L2ARC needs its own copy of
9530                          * the data so that the ZIO below can't race with the
9531                          * buf consumer. To ensure that this copy will be
9532                          * available for the lifetime of the ZIO and be cleaned
9533                          * up afterwards, we add it to the l2arc_free_on_write
9534                          * queue. If we need to apply any transforms to the
9535                          * data (compression, encryption) we will also need the
9536                          * extra buffer.
9537                          */
9538                         if (HDR_HAS_RABD(hdr) && psize == asize) {
9539                                 to_write = hdr->b_crypt_hdr.b_rabd;
9540                         } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
9541                             HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
9542                             !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
9543                             psize == asize) {
9544                                 to_write = hdr->b_l1hdr.b_pabd;
9545                         } else {
9546                                 int ret;
9547                                 arc_buf_contents_t type = arc_buf_type(hdr);
9548
9549                                 ret = l2arc_apply_transforms(spa, hdr, asize,
9550                                     &to_write);
9551                                 if (ret != 0) {
9552                                         arc_hdr_clear_flags(hdr,
9553                                             ARC_FLAG_L2_WRITING);
9554                                         mutex_exit(hash_lock);
9555                                         continue;
9556                                 }
9557
9558                                 l2arc_free_abd_on_write(to_write, asize, type);
9559                         }
9560
9561                         if (pio == NULL) {
9562                                 /*
9563                                  * Insert a dummy header on the buflist so
9564                                  * l2arc_write_done() can find where the
9565                                  * write buffers begin without searching.
9566                                  */
9567                                 mutex_enter(&dev->l2ad_mtx);
9568                                 list_insert_head(&dev->l2ad_buflist, head);
9569                                 mutex_exit(&dev->l2ad_mtx);
9570
9571                                 cb = kmem_alloc(
9572                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
9573                                 cb->l2wcb_dev = dev;
9574                                 cb->l2wcb_head = head;
9575                                 /*
9576                                  * Create a list to save allocated abd buffers
9577                                  * for l2arc_log_blk_commit().
9578                                  */
9579                                 list_create(&cb->l2wcb_abd_list,
9580                                     sizeof (l2arc_lb_abd_buf_t),
9581                                     offsetof(l2arc_lb_abd_buf_t, node));
9582                                 pio = zio_root(spa, l2arc_write_done, cb,
9583                                     ZIO_FLAG_CANFAIL);
9584                         }
9585
9586                         hdr->b_l2hdr.b_dev = dev;
9587                         hdr->b_l2hdr.b_hits = 0;
9588
9589                         hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
9590                         hdr->b_l2hdr.b_arcs_state =
9591                             hdr->b_l1hdr.b_state->arcs_state;
9592                         arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
9593
9594                         mutex_enter(&dev->l2ad_mtx);
9595                         list_insert_head(&dev->l2ad_buflist, hdr);
9596                         mutex_exit(&dev->l2ad_mtx);
9597
9598                         (void) zfs_refcount_add_many(&dev->l2ad_alloc,
9599                             arc_hdr_size(hdr), hdr);
9600
9601                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
9602                             hdr->b_l2hdr.b_daddr, asize, to_write,
9603                             ZIO_CHECKSUM_OFF, NULL, hdr,
9604                             ZIO_PRIORITY_ASYNC_WRITE,
9605                             ZIO_FLAG_CANFAIL, B_FALSE);
9606
9607                         write_lsize += HDR_GET_LSIZE(hdr);
9608                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
9609                             zio_t *, wzio);
9610
9611                         write_psize += psize;
9612                         write_asize += asize;
9613                         dev->l2ad_hand += asize;
9614                         l2arc_hdr_arcstats_increment(hdr);
9615                         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
9616
9617                         mutex_exit(hash_lock);
9618
9619                         /*
9620                          * Append buf info to current log and commit if full.
9621                          * arcstat_l2_{size,asize} kstats are updated
9622                          * internally.
9623                          */
9624                         if (l2arc_log_blk_insert(dev, hdr))
9625                                 l2arc_log_blk_commit(dev, pio, cb);
9626
9627                         zio_nowait(wzio);
9628                 }
9629
9630                 multilist_sublist_unlock(mls);
9631
9632                 if (full == B_TRUE)
9633                         break;
9634         }
9635
9636         /* No buffers selected for writing? */
9637         if (pio == NULL) {
9638                 ASSERT0(write_lsize);
9639                 ASSERT(!HDR_HAS_L1HDR(head));
9640                 kmem_cache_free(hdr_l2only_cache, head);
9641
9642                 /*
9643                  * Although we did not write any buffers l2ad_evict may
9644                  * have advanced.
9645                  */
9646                 if (dev->l2ad_evict != l2dhdr->dh_evict)
9647                         l2arc_dev_hdr_update(dev);
9648
9649                 return (0);
9650         }
9651
9652         if (!dev->l2ad_first)
9653                 ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
9654
9655         ASSERT3U(write_asize, <=, target_sz);
9656         ARCSTAT_BUMP(arcstat_l2_writes_sent);
9657         ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
9658
9659         dev->l2ad_writing = B_TRUE;
9660         (void) zio_wait(pio);
9661         dev->l2ad_writing = B_FALSE;
9662
9663         /*
9664          * Update the device header after the zio completes as
9665          * l2arc_write_done() may have updated the memory holding the log block
9666          * pointers in the device header.
9667          */
9668         l2arc_dev_hdr_update(dev);
9669
9670         return (write_asize);
9671 }
9672
9673 static boolean_t
9674 l2arc_hdr_limit_reached(void)
9675 {
9676         int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
9677
9678         return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
9679             (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
9680 }
9681
9682 /*
9683  * This thread feeds the L2ARC at regular intervals.  This is the beating
9684  * heart of the L2ARC.
9685  */
9686 /* ARGSUSED */
9687 static void
9688 l2arc_feed_thread(void *unused)
9689 {
9690         callb_cpr_t cpr;
9691         l2arc_dev_t *dev;
9692         spa_t *spa;
9693         uint64_t size, wrote;
9694         clock_t begin, next = ddi_get_lbolt();
9695         fstrans_cookie_t cookie;
9696
9697         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
9698
9699         mutex_enter(&l2arc_feed_thr_lock);
9700
9701         cookie = spl_fstrans_mark();
9702         while (l2arc_thread_exit == 0) {
9703                 CALLB_CPR_SAFE_BEGIN(&cpr);
9704                 (void) cv_timedwait_idle(&l2arc_feed_thr_cv,
9705                     &l2arc_feed_thr_lock, next);
9706                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
9707                 next = ddi_get_lbolt() + hz;
9708
9709                 /*
9710                  * Quick check for L2ARC devices.
9711                  */
9712                 mutex_enter(&l2arc_dev_mtx);
9713                 if (l2arc_ndev == 0) {
9714                         mutex_exit(&l2arc_dev_mtx);
9715                         continue;
9716                 }
9717                 mutex_exit(&l2arc_dev_mtx);
9718                 begin = ddi_get_lbolt();
9719
9720                 /*
9721                  * This selects the next l2arc device to write to, and in
9722                  * doing so the next spa to feed from: dev->l2ad_spa.   This
9723                  * will return NULL if there are now no l2arc devices or if
9724                  * they are all faulted.
9725                  *
9726                  * If a device is returned, its spa's config lock is also
9727                  * held to prevent device removal.  l2arc_dev_get_next()
9728                  * will grab and release l2arc_dev_mtx.
9729                  */
9730                 if ((dev = l2arc_dev_get_next()) == NULL)
9731                         continue;
9732
9733                 spa = dev->l2ad_spa;
9734                 ASSERT3P(spa, !=, NULL);
9735
9736                 /*
9737                  * If the pool is read-only then force the feed thread to
9738                  * sleep a little longer.
9739                  */
9740                 if (!spa_writeable(spa)) {
9741                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
9742                         spa_config_exit(spa, SCL_L2ARC, dev);
9743                         continue;
9744                 }
9745
9746                 /*
9747                  * Avoid contributing to memory pressure.
9748                  */
9749                 if (l2arc_hdr_limit_reached()) {
9750                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
9751                         spa_config_exit(spa, SCL_L2ARC, dev);
9752                         continue;
9753                 }
9754
9755                 ARCSTAT_BUMP(arcstat_l2_feeds);
9756
9757                 size = l2arc_write_size(dev);
9758
9759                 /*
9760                  * Evict L2ARC buffers that will be overwritten.
9761                  */
9762                 l2arc_evict(dev, size, B_FALSE);
9763
9764                 /*
9765                  * Write ARC buffers.
9766                  */
9767                 wrote = l2arc_write_buffers(spa, dev, size);
9768
9769                 /*
9770                  * Calculate interval between writes.
9771                  */
9772                 next = l2arc_write_interval(begin, size, wrote);
9773                 spa_config_exit(spa, SCL_L2ARC, dev);
9774         }
9775         spl_fstrans_unmark(cookie);
9776
9777         l2arc_thread_exit = 0;
9778         cv_broadcast(&l2arc_feed_thr_cv);
9779         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
9780         thread_exit();
9781 }
9782
9783 boolean_t
9784 l2arc_vdev_present(vdev_t *vd)
9785 {
9786         return (l2arc_vdev_get(vd) != NULL);
9787 }
9788
9789 /*
9790  * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
9791  * the vdev_t isn't an L2ARC device.
9792  */
9793 l2arc_dev_t *
9794 l2arc_vdev_get(vdev_t *vd)
9795 {
9796         l2arc_dev_t     *dev;
9797
9798         mutex_enter(&l2arc_dev_mtx);
9799         for (dev = list_head(l2arc_dev_list); dev != NULL;
9800             dev = list_next(l2arc_dev_list, dev)) {
9801                 if (dev->l2ad_vdev == vd)
9802                         break;
9803         }
9804         mutex_exit(&l2arc_dev_mtx);
9805
9806         return (dev);
9807 }
9808
9809 static void
9810 l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
9811 {
9812         l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
9813         uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
9814         spa_t *spa = dev->l2ad_spa;
9815
9816         /*
9817          * The L2ARC has to hold at least the payload of one log block for
9818          * them to be restored (persistent L2ARC). The payload of a log block
9819          * depends on the amount of its log entries. We always write log blocks
9820          * with 1022 entries. How many of them are committed or restored depends
9821          * on the size of the L2ARC device. Thus the maximum payload of
9822          * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
9823          * is less than that, we reduce the amount of committed and restored
9824          * log entries per block so as to enable persistence.
9825          */
9826         if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
9827                 dev->l2ad_log_entries = 0;
9828         } else {
9829                 dev->l2ad_log_entries = MIN((dev->l2ad_end -
9830                     dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
9831                     L2ARC_LOG_BLK_MAX_ENTRIES);
9832         }
9833
9834         /*
9835          * Read the device header, if an error is returned do not rebuild L2ARC.
9836          */
9837         if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
9838                 /*
9839                  * If we are onlining a cache device (vdev_reopen) that was
9840                  * still present (l2arc_vdev_present()) and rebuild is enabled,
9841                  * we should evict all ARC buffers and pointers to log blocks
9842                  * and reclaim their space before restoring its contents to
9843                  * L2ARC.
9844                  */
9845                 if (reopen) {
9846                         if (!l2arc_rebuild_enabled) {
9847                                 return;
9848                         } else {
9849                                 l2arc_evict(dev, 0, B_TRUE);
9850                                 /* start a new log block */
9851                                 dev->l2ad_log_ent_idx = 0;
9852                                 dev->l2ad_log_blk_payload_asize = 0;
9853                                 dev->l2ad_log_blk_payload_start = 0;
9854                         }
9855                 }
9856                 /*
9857                  * Just mark the device as pending for a rebuild. We won't
9858                  * be starting a rebuild in line here as it would block pool
9859                  * import. Instead spa_load_impl will hand that off to an
9860                  * async task which will call l2arc_spa_rebuild_start.
9861                  */
9862                 dev->l2ad_rebuild = B_TRUE;
9863         } else if (spa_writeable(spa)) {
9864                 /*
9865                  * In this case TRIM the whole device if l2arc_trim_ahead > 0,
9866                  * otherwise create a new header. We zero out the memory holding
9867                  * the header to reset dh_start_lbps. If we TRIM the whole
9868                  * device the new header will be written by
9869                  * vdev_trim_l2arc_thread() at the end of the TRIM to update the
9870                  * trim_state in the header too. When reading the header, if
9871                  * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
9872                  * we opt to TRIM the whole device again.
9873                  */
9874                 if (l2arc_trim_ahead > 0) {
9875                         dev->l2ad_trim_all = B_TRUE;
9876                 } else {
9877                         bzero(l2dhdr, l2dhdr_asize);
9878                         l2arc_dev_hdr_update(dev);
9879                 }
9880         }
9881 }
9882
9883 /*
9884  * Add a vdev for use by the L2ARC.  By this point the spa has already
9885  * validated the vdev and opened it.
9886  */
9887 void
9888 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
9889 {
9890         l2arc_dev_t             *adddev;
9891         uint64_t                l2dhdr_asize;
9892
9893         ASSERT(!l2arc_vdev_present(vd));
9894
9895         /*
9896          * Create a new l2arc device entry.
9897          */
9898         adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
9899         adddev->l2ad_spa = spa;
9900         adddev->l2ad_vdev = vd;
9901         /* leave extra size for an l2arc device header */
9902         l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
9903             MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
9904         adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
9905         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
9906         ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
9907         adddev->l2ad_hand = adddev->l2ad_start;
9908         adddev->l2ad_evict = adddev->l2ad_start;
9909         adddev->l2ad_first = B_TRUE;
9910         adddev->l2ad_writing = B_FALSE;
9911         adddev->l2ad_trim_all = B_FALSE;
9912         list_link_init(&adddev->l2ad_node);
9913         adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
9914
9915         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
9916         /*
9917          * This is a list of all ARC buffers that are still valid on the
9918          * device.
9919          */
9920         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
9921             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
9922
9923         /*
9924          * This is a list of pointers to log blocks that are still present
9925          * on the device.
9926          */
9927         list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
9928             offsetof(l2arc_lb_ptr_buf_t, node));
9929
9930         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
9931         zfs_refcount_create(&adddev->l2ad_alloc);
9932         zfs_refcount_create(&adddev->l2ad_lb_asize);
9933         zfs_refcount_create(&adddev->l2ad_lb_count);
9934
9935         /*
9936          * Decide if dev is eligible for L2ARC rebuild or whole device
9937          * trimming. This has to happen before the device is added in the
9938          * cache device list and l2arc_dev_mtx is released. Otherwise
9939          * l2arc_feed_thread() might already start writing on the
9940          * device.
9941          */
9942         l2arc_rebuild_dev(adddev, B_FALSE);
9943
9944         /*
9945          * Add device to global list
9946          */
9947         mutex_enter(&l2arc_dev_mtx);
9948         list_insert_head(l2arc_dev_list, adddev);
9949         atomic_inc_64(&l2arc_ndev);
9950         mutex_exit(&l2arc_dev_mtx);
9951 }
9952
9953 /*
9954  * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
9955  * in case of onlining a cache device.
9956  */
9957 void
9958 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
9959 {
9960         l2arc_dev_t             *dev = NULL;
9961
9962         dev = l2arc_vdev_get(vd);
9963         ASSERT3P(dev, !=, NULL);
9964
9965         /*
9966          * In contrast to l2arc_add_vdev() we do not have to worry about
9967          * l2arc_feed_thread() invalidating previous content when onlining a
9968          * cache device. The device parameters (l2ad*) are not cleared when
9969          * offlining the device and writing new buffers will not invalidate
9970          * all previous content. In worst case only buffers that have not had
9971          * their log block written to the device will be lost.
9972          * When onlining the cache device (ie offline->online without exporting
9973          * the pool in between) this happens:
9974          * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
9975          *                      |                       |
9976          *              vdev_is_dead() = B_FALSE        l2ad_rebuild = B_TRUE
9977          * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
9978          * is set to B_TRUE we might write additional buffers to the device.
9979          */
9980         l2arc_rebuild_dev(dev, reopen);
9981 }
9982
9983 /*
9984  * Remove a vdev from the L2ARC.
9985  */
9986 void
9987 l2arc_remove_vdev(vdev_t *vd)
9988 {
9989         l2arc_dev_t *remdev = NULL;
9990
9991         /*
9992          * Find the device by vdev
9993          */
9994         remdev = l2arc_vdev_get(vd);
9995         ASSERT3P(remdev, !=, NULL);
9996
9997         /*
9998          * Cancel any ongoing or scheduled rebuild.
9999          */
10000         mutex_enter(&l2arc_rebuild_thr_lock);
10001         if (remdev->l2ad_rebuild_began == B_TRUE) {
10002                 remdev->l2ad_rebuild_cancel = B_TRUE;
10003                 while (remdev->l2ad_rebuild == B_TRUE)
10004                         cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
10005         }
10006         mutex_exit(&l2arc_rebuild_thr_lock);
10007
10008         /*
10009          * Remove device from global list
10010          */
10011         mutex_enter(&l2arc_dev_mtx);
10012         list_remove(l2arc_dev_list, remdev);
10013         l2arc_dev_last = NULL;          /* may have been invalidated */
10014         atomic_dec_64(&l2arc_ndev);
10015         mutex_exit(&l2arc_dev_mtx);
10016
10017         /*
10018          * Clear all buflists and ARC references.  L2ARC device flush.
10019          */
10020         l2arc_evict(remdev, 0, B_TRUE);
10021         list_destroy(&remdev->l2ad_buflist);
10022         ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
10023         list_destroy(&remdev->l2ad_lbptr_list);
10024         mutex_destroy(&remdev->l2ad_mtx);
10025         zfs_refcount_destroy(&remdev->l2ad_alloc);
10026         zfs_refcount_destroy(&remdev->l2ad_lb_asize);
10027         zfs_refcount_destroy(&remdev->l2ad_lb_count);
10028         kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
10029         vmem_free(remdev, sizeof (l2arc_dev_t));
10030 }
10031
10032 void
10033 l2arc_init(void)
10034 {
10035         l2arc_thread_exit = 0;
10036         l2arc_ndev = 0;
10037
10038         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
10039         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
10040         mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
10041         cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
10042         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
10043         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
10044
10045         l2arc_dev_list = &L2ARC_dev_list;
10046         l2arc_free_on_write = &L2ARC_free_on_write;
10047         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
10048             offsetof(l2arc_dev_t, l2ad_node));
10049         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
10050             offsetof(l2arc_data_free_t, l2df_list_node));
10051 }
10052
10053 void
10054 l2arc_fini(void)
10055 {
10056         mutex_destroy(&l2arc_feed_thr_lock);
10057         cv_destroy(&l2arc_feed_thr_cv);
10058         mutex_destroy(&l2arc_rebuild_thr_lock);
10059         cv_destroy(&l2arc_rebuild_thr_cv);
10060         mutex_destroy(&l2arc_dev_mtx);
10061         mutex_destroy(&l2arc_free_on_write_mtx);
10062
10063         list_destroy(l2arc_dev_list);
10064         list_destroy(l2arc_free_on_write);
10065 }
10066
10067 void
10068 l2arc_start(void)
10069 {
10070         if (!(spa_mode_global & SPA_MODE_WRITE))
10071                 return;
10072
10073         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
10074             TS_RUN, defclsyspri);
10075 }
10076
10077 void
10078 l2arc_stop(void)
10079 {
10080         if (!(spa_mode_global & SPA_MODE_WRITE))
10081                 return;
10082
10083         mutex_enter(&l2arc_feed_thr_lock);
10084         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
10085         l2arc_thread_exit = 1;
10086         while (l2arc_thread_exit != 0)
10087                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
10088         mutex_exit(&l2arc_feed_thr_lock);
10089 }
10090
10091 /*
10092  * Punches out rebuild threads for the L2ARC devices in a spa. This should
10093  * be called after pool import from the spa async thread, since starting
10094  * these threads directly from spa_import() will make them part of the
10095  * "zpool import" context and delay process exit (and thus pool import).
10096  */
10097 void
10098 l2arc_spa_rebuild_start(spa_t *spa)
10099 {
10100         ASSERT(MUTEX_HELD(&spa_namespace_lock));
10101
10102         /*
10103          * Locate the spa's l2arc devices and kick off rebuild threads.
10104          */
10105         for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
10106                 l2arc_dev_t *dev =
10107                     l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
10108                 if (dev == NULL) {
10109                         /* Don't attempt a rebuild if the vdev is UNAVAIL */
10110                         continue;
10111                 }
10112                 mutex_enter(&l2arc_rebuild_thr_lock);
10113                 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
10114                         dev->l2ad_rebuild_began = B_TRUE;
10115                         (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
10116                             dev, 0, &p0, TS_RUN, minclsyspri);
10117                 }
10118                 mutex_exit(&l2arc_rebuild_thr_lock);
10119         }
10120 }
10121
10122 /*
10123  * Main entry point for L2ARC rebuilding.
10124  */
10125 static void
10126 l2arc_dev_rebuild_thread(void *arg)
10127 {
10128         l2arc_dev_t *dev = arg;
10129
10130         VERIFY(!dev->l2ad_rebuild_cancel);
10131         VERIFY(dev->l2ad_rebuild);
10132         (void) l2arc_rebuild(dev);
10133         mutex_enter(&l2arc_rebuild_thr_lock);
10134         dev->l2ad_rebuild_began = B_FALSE;
10135         dev->l2ad_rebuild = B_FALSE;
10136         mutex_exit(&l2arc_rebuild_thr_lock);
10137
10138         thread_exit();
10139 }
10140
10141 /*
10142  * This function implements the actual L2ARC metadata rebuild. It:
10143  * starts reading the log block chain and restores each block's contents
10144  * to memory (reconstructing arc_buf_hdr_t's).
10145  *
10146  * Operation stops under any of the following conditions:
10147  *
10148  * 1) We reach the end of the log block chain.
10149  * 2) We encounter *any* error condition (cksum errors, io errors)
10150  */
10151 static int
10152 l2arc_rebuild(l2arc_dev_t *dev)
10153 {
10154         vdev_t                  *vd = dev->l2ad_vdev;
10155         spa_t                   *spa = vd->vdev_spa;
10156         int                     err = 0;
10157         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
10158         l2arc_log_blk_phys_t    *this_lb, *next_lb;
10159         zio_t                   *this_io = NULL, *next_io = NULL;
10160         l2arc_log_blkptr_t      lbps[2];
10161         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
10162         boolean_t               lock_held;
10163
10164         this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
10165         next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
10166
10167         /*
10168          * We prevent device removal while issuing reads to the device,
10169          * then during the rebuilding phases we drop this lock again so
10170          * that a spa_unload or device remove can be initiated - this is
10171          * safe, because the spa will signal us to stop before removing
10172          * our device and wait for us to stop.
10173          */
10174         spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
10175         lock_held = B_TRUE;
10176
10177         /*
10178          * Retrieve the persistent L2ARC device state.
10179          * L2BLK_GET_PSIZE returns aligned size for log blocks.
10180          */
10181         dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
10182         dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
10183             L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
10184             dev->l2ad_start);
10185         dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
10186
10187         vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
10188         vd->vdev_trim_state = l2dhdr->dh_trim_state;
10189
10190         /*
10191          * In case the zfs module parameter l2arc_rebuild_enabled is false
10192          * we do not start the rebuild process.
10193          */
10194         if (!l2arc_rebuild_enabled)
10195                 goto out;
10196
10197         /* Prepare the rebuild process */
10198         bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
10199
10200         /* Start the rebuild process */
10201         for (;;) {
10202                 if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
10203                         break;
10204
10205                 if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
10206                     this_lb, next_lb, this_io, &next_io)) != 0)
10207                         goto out;
10208
10209                 /*
10210                  * Our memory pressure valve. If the system is running low
10211                  * on memory, rather than swamping memory with new ARC buf
10212                  * hdrs, we opt not to rebuild the L2ARC. At this point,
10213                  * however, we have already set up our L2ARC dev to chain in
10214                  * new metadata log blocks, so the user may choose to offline/
10215                  * online the L2ARC dev at a later time (or re-import the pool)
10216                  * to reconstruct it (when there's less memory pressure).
10217                  */
10218                 if (l2arc_hdr_limit_reached()) {
10219                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
10220                         cmn_err(CE_NOTE, "System running low on memory, "
10221                             "aborting L2ARC rebuild.");
10222                         err = SET_ERROR(ENOMEM);
10223                         goto out;
10224                 }
10225
10226                 spa_config_exit(spa, SCL_L2ARC, vd);
10227                 lock_held = B_FALSE;
10228
10229                 /*
10230                  * Now that we know that the next_lb checks out alright, we
10231                  * can start reconstruction from this log block.
10232                  * L2BLK_GET_PSIZE returns aligned size for log blocks.
10233                  */
10234                 uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
10235                 l2arc_log_blk_restore(dev, this_lb, asize);
10236
10237                 /*
10238                  * log block restored, include its pointer in the list of
10239                  * pointers to log blocks present in the L2ARC device.
10240                  */
10241                 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
10242                 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
10243                     KM_SLEEP);
10244                 bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
10245                     sizeof (l2arc_log_blkptr_t));
10246                 mutex_enter(&dev->l2ad_mtx);
10247                 list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
10248                 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
10249                 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
10250                 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
10251                 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
10252                 mutex_exit(&dev->l2ad_mtx);
10253                 vdev_space_update(vd, asize, 0, 0);
10254
10255                 /*
10256                  * Protection against loops of log blocks:
10257                  *
10258                  *                                     l2ad_hand  l2ad_evict
10259                  *                                         V          V
10260                  * l2ad_start |=======================================| l2ad_end
10261                  *             -----|||----|||---|||----|||
10262                  *                  (3)    (2)   (1)    (0)
10263                  *             ---|||---|||----|||---|||
10264                  *                (7)   (6)    (5)   (4)
10265                  *
10266                  * In this situation the pointer of log block (4) passes
10267                  * l2arc_log_blkptr_valid() but the log block should not be
10268                  * restored as it is overwritten by the payload of log block
10269                  * (0). Only log blocks (0)-(3) should be restored. We check
10270                  * whether l2ad_evict lies in between the payload starting
10271                  * offset of the next log block (lbps[1].lbp_payload_start)
10272                  * and the payload starting offset of the present log block
10273                  * (lbps[0].lbp_payload_start). If true and this isn't the
10274                  * first pass, we are looping from the beginning and we should
10275                  * stop.
10276                  */
10277                 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
10278                     lbps[0].lbp_payload_start, dev->l2ad_evict) &&
10279                     !dev->l2ad_first)
10280                         goto out;
10281
10282                 cond_resched();
10283                 for (;;) {
10284                         mutex_enter(&l2arc_rebuild_thr_lock);
10285                         if (dev->l2ad_rebuild_cancel) {
10286                                 dev->l2ad_rebuild = B_FALSE;
10287                                 cv_signal(&l2arc_rebuild_thr_cv);
10288                                 mutex_exit(&l2arc_rebuild_thr_lock);
10289                                 err = SET_ERROR(ECANCELED);
10290                                 goto out;
10291                         }
10292                         mutex_exit(&l2arc_rebuild_thr_lock);
10293                         if (spa_config_tryenter(spa, SCL_L2ARC, vd,
10294                             RW_READER)) {
10295                                 lock_held = B_TRUE;
10296                                 break;
10297                         }
10298                         /*
10299                          * L2ARC config lock held by somebody in writer,
10300                          * possibly due to them trying to remove us. They'll
10301                          * likely to want us to shut down, so after a little
10302                          * delay, we check l2ad_rebuild_cancel and retry
10303                          * the lock again.
10304                          */
10305                         delay(1);
10306                 }
10307
10308                 /*
10309                  * Continue with the next log block.
10310                  */
10311                 lbps[0] = lbps[1];
10312                 lbps[1] = this_lb->lb_prev_lbp;
10313                 PTR_SWAP(this_lb, next_lb);
10314                 this_io = next_io;
10315                 next_io = NULL;
10316         }
10317
10318         if (this_io != NULL)
10319                 l2arc_log_blk_fetch_abort(this_io);
10320 out:
10321         if (next_io != NULL)
10322                 l2arc_log_blk_fetch_abort(next_io);
10323         vmem_free(this_lb, sizeof (*this_lb));
10324         vmem_free(next_lb, sizeof (*next_lb));
10325
10326         if (!l2arc_rebuild_enabled) {
10327                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10328                     "disabled");
10329         } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
10330                 ARCSTAT_BUMP(arcstat_l2_rebuild_success);
10331                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10332                     "successful, restored %llu blocks",
10333                     (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10334         } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
10335                 /*
10336                  * No error but also nothing restored, meaning the lbps array
10337                  * in the device header points to invalid/non-present log
10338                  * blocks. Reset the header.
10339                  */
10340                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10341                     "no valid log blocks");
10342                 bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
10343                 l2arc_dev_hdr_update(dev);
10344         } else if (err == ECANCELED) {
10345                 /*
10346                  * In case the rebuild was canceled do not log to spa history
10347                  * log as the pool may be in the process of being removed.
10348                  */
10349                 zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
10350                     (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10351         } else if (err != 0) {
10352                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10353                     "aborted, restored %llu blocks",
10354                     (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10355         }
10356
10357         if (lock_held)
10358                 spa_config_exit(spa, SCL_L2ARC, vd);
10359
10360         return (err);
10361 }
10362
10363 /*
10364  * Attempts to read the device header on the provided L2ARC device and writes
10365  * it to `hdr'. On success, this function returns 0, otherwise the appropriate
10366  * error code is returned.
10367  */
10368 static int
10369 l2arc_dev_hdr_read(l2arc_dev_t *dev)
10370 {
10371         int                     err;
10372         uint64_t                guid;
10373         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
10374         const uint64_t          l2dhdr_asize = dev->l2ad_dev_hdr_asize;
10375         abd_t                   *abd;
10376
10377         guid = spa_guid(dev->l2ad_vdev->vdev_spa);
10378
10379         abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
10380
10381         err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
10382             VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
10383             ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
10384             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
10385             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
10386             ZIO_FLAG_SPECULATIVE, B_FALSE));
10387
10388         abd_free(abd);
10389
10390         if (err != 0) {
10391                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
10392                 zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
10393                     "vdev guid: %llu", err,
10394                     (u_longlong_t)dev->l2ad_vdev->vdev_guid);
10395                 return (err);
10396         }
10397
10398         if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
10399                 byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
10400
10401         if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
10402             l2dhdr->dh_spa_guid != guid ||
10403             l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
10404             l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
10405             l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
10406             l2dhdr->dh_end != dev->l2ad_end ||
10407             !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
10408             l2dhdr->dh_evict) ||
10409             (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
10410             l2arc_trim_ahead > 0)) {
10411                 /*
10412                  * Attempt to rebuild a device containing no actual dev hdr
10413                  * or containing a header from some other pool or from another
10414                  * version of persistent L2ARC.
10415                  */
10416                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
10417                 return (SET_ERROR(ENOTSUP));
10418         }
10419
10420         return (0);
10421 }
10422
10423 /*
10424  * Reads L2ARC log blocks from storage and validates their contents.
10425  *
10426  * This function implements a simple fetcher to make sure that while
10427  * we're processing one buffer the L2ARC is already fetching the next
10428  * one in the chain.
10429  *
10430  * The arguments this_lp and next_lp point to the current and next log block
10431  * address in the block chain. Similarly, this_lb and next_lb hold the
10432  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
10433  *
10434  * The `this_io' and `next_io' arguments are used for block fetching.
10435  * When issuing the first blk IO during rebuild, you should pass NULL for
10436  * `this_io'. This function will then issue a sync IO to read the block and
10437  * also issue an async IO to fetch the next block in the block chain. The
10438  * fetched IO is returned in `next_io'. On subsequent calls to this
10439  * function, pass the value returned in `next_io' from the previous call
10440  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
10441  * Prior to the call, you should initialize your `next_io' pointer to be
10442  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
10443  *
10444  * On success, this function returns 0, otherwise it returns an appropriate
10445  * error code. On error the fetching IO is aborted and cleared before
10446  * returning from this function. Therefore, if we return `success', the
10447  * caller can assume that we have taken care of cleanup of fetch IOs.
10448  */
10449 static int
10450 l2arc_log_blk_read(l2arc_dev_t *dev,
10451     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
10452     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
10453     zio_t *this_io, zio_t **next_io)
10454 {
10455         int             err = 0;
10456         zio_cksum_t     cksum;
10457         abd_t           *abd = NULL;
10458         uint64_t        asize;
10459
10460         ASSERT(this_lbp != NULL && next_lbp != NULL);
10461         ASSERT(this_lb != NULL && next_lb != NULL);
10462         ASSERT(next_io != NULL && *next_io == NULL);
10463         ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
10464
10465         /*
10466          * Check to see if we have issued the IO for this log block in a
10467          * previous run. If not, this is the first call, so issue it now.
10468          */
10469         if (this_io == NULL) {
10470                 this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
10471                     this_lb);
10472         }
10473
10474         /*
10475          * Peek to see if we can start issuing the next IO immediately.
10476          */
10477         if (l2arc_log_blkptr_valid(dev, next_lbp)) {
10478                 /*
10479                  * Start issuing IO for the next log block early - this
10480                  * should help keep the L2ARC device busy while we
10481                  * decompress and restore this log block.
10482                  */
10483                 *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
10484                     next_lb);
10485         }
10486
10487         /* Wait for the IO to read this log block to complete */
10488         if ((err = zio_wait(this_io)) != 0) {
10489                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
10490                 zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
10491                     "offset: %llu, vdev guid: %llu", err,
10492                     (u_longlong_t)this_lbp->lbp_daddr,
10493                     (u_longlong_t)dev->l2ad_vdev->vdev_guid);
10494                 goto cleanup;
10495         }
10496
10497         /*
10498          * Make sure the buffer checks out.
10499          * L2BLK_GET_PSIZE returns aligned size for log blocks.
10500          */
10501         asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
10502         fletcher_4_native(this_lb, asize, NULL, &cksum);
10503         if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
10504                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
10505                 zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
10506                     "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
10507                     (u_longlong_t)this_lbp->lbp_daddr,
10508                     (u_longlong_t)dev->l2ad_vdev->vdev_guid,
10509                     (u_longlong_t)dev->l2ad_hand,
10510                     (u_longlong_t)dev->l2ad_evict);
10511                 err = SET_ERROR(ECKSUM);
10512                 goto cleanup;
10513         }
10514
10515         /* Now we can take our time decoding this buffer */
10516         switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
10517         case ZIO_COMPRESS_OFF:
10518                 break;
10519         case ZIO_COMPRESS_LZ4:
10520                 abd = abd_alloc_for_io(asize, B_TRUE);
10521                 abd_copy_from_buf_off(abd, this_lb, 0, asize);
10522                 if ((err = zio_decompress_data(
10523                     L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
10524                     abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
10525                         err = SET_ERROR(EINVAL);
10526                         goto cleanup;
10527                 }
10528                 break;
10529         default:
10530                 err = SET_ERROR(EINVAL);
10531                 goto cleanup;
10532         }
10533         if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
10534                 byteswap_uint64_array(this_lb, sizeof (*this_lb));
10535         if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
10536                 err = SET_ERROR(EINVAL);
10537                 goto cleanup;
10538         }
10539 cleanup:
10540         /* Abort an in-flight fetch I/O in case of error */
10541         if (err != 0 && *next_io != NULL) {
10542                 l2arc_log_blk_fetch_abort(*next_io);
10543                 *next_io = NULL;
10544         }
10545         if (abd != NULL)
10546                 abd_free(abd);
10547         return (err);
10548 }
10549
10550 /*
10551  * Restores the payload of a log block to ARC. This creates empty ARC hdr
10552  * entries which only contain an l2arc hdr, essentially restoring the
10553  * buffers to their L2ARC evicted state. This function also updates space
10554  * usage on the L2ARC vdev to make sure it tracks restored buffers.
10555  */
10556 static void
10557 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
10558     uint64_t lb_asize)
10559 {
10560         uint64_t        size = 0, asize = 0;
10561         uint64_t        log_entries = dev->l2ad_log_entries;
10562
10563         /*
10564          * Usually arc_adapt() is called only for data, not headers, but
10565          * since we may allocate significant amount of memory here, let ARC
10566          * grow its arc_c.
10567          */
10568         arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
10569
10570         for (int i = log_entries - 1; i >= 0; i--) {
10571                 /*
10572                  * Restore goes in the reverse temporal direction to preserve
10573                  * correct temporal ordering of buffers in the l2ad_buflist.
10574                  * l2arc_hdr_restore also does a list_insert_tail instead of
10575                  * list_insert_head on the l2ad_buflist:
10576                  *
10577                  *              LIST    l2ad_buflist            LIST
10578                  *              HEAD  <------ (time) ------     TAIL
10579                  * direction    +-----+-----+-----+-----+-----+    direction
10580                  * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
10581                  * fill         +-----+-----+-----+-----+-----+
10582                  *              ^                               ^
10583                  *              |                               |
10584                  *              |                               |
10585                  *      l2arc_feed_thread               l2arc_rebuild
10586                  *      will place new bufs here        restores bufs here
10587                  *
10588                  * During l2arc_rebuild() the device is not used by
10589                  * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
10590                  */
10591                 size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
10592                 asize += vdev_psize_to_asize(dev->l2ad_vdev,
10593                     L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
10594                 l2arc_hdr_restore(&lb->lb_entries[i], dev);
10595         }
10596
10597         /*
10598          * Record rebuild stats:
10599          *      size            Logical size of restored buffers in the L2ARC
10600          *      asize           Aligned size of restored buffers in the L2ARC
10601          */
10602         ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
10603         ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
10604         ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
10605         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
10606         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
10607         ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
10608 }
10609
10610 /*
10611  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
10612  * into a state indicating that it has been evicted to L2ARC.
10613  */
10614 static void
10615 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
10616 {
10617         arc_buf_hdr_t           *hdr, *exists;
10618         kmutex_t                *hash_lock;
10619         arc_buf_contents_t      type = L2BLK_GET_TYPE((le)->le_prop);
10620         uint64_t                asize;
10621
10622         /*
10623          * Do all the allocation before grabbing any locks, this lets us
10624          * sleep if memory is full and we don't have to deal with failed
10625          * allocations.
10626          */
10627         hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
10628             dev, le->le_dva, le->le_daddr,
10629             L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
10630             L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
10631             L2BLK_GET_PROTECTED((le)->le_prop),
10632             L2BLK_GET_PREFETCH((le)->le_prop),
10633             L2BLK_GET_STATE((le)->le_prop));
10634         asize = vdev_psize_to_asize(dev->l2ad_vdev,
10635             L2BLK_GET_PSIZE((le)->le_prop));
10636
10637         /*
10638          * vdev_space_update() has to be called before arc_hdr_destroy() to
10639          * avoid underflow since the latter also calls vdev_space_update().
10640          */
10641         l2arc_hdr_arcstats_increment(hdr);
10642         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
10643
10644         mutex_enter(&dev->l2ad_mtx);
10645         list_insert_tail(&dev->l2ad_buflist, hdr);
10646         (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
10647         mutex_exit(&dev->l2ad_mtx);
10648
10649         exists = buf_hash_insert(hdr, &hash_lock);
10650         if (exists) {
10651                 /* Buffer was already cached, no need to restore it. */
10652                 arc_hdr_destroy(hdr);
10653                 /*
10654                  * If the buffer is already cached, check whether it has
10655                  * L2ARC metadata. If not, enter them and update the flag.
10656                  * This is important is case of onlining a cache device, since
10657                  * we previously evicted all L2ARC metadata from ARC.
10658                  */
10659                 if (!HDR_HAS_L2HDR(exists)) {
10660                         arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
10661                         exists->b_l2hdr.b_dev = dev;
10662                         exists->b_l2hdr.b_daddr = le->le_daddr;
10663                         exists->b_l2hdr.b_arcs_state =
10664                             L2BLK_GET_STATE((le)->le_prop);
10665                         mutex_enter(&dev->l2ad_mtx);
10666                         list_insert_tail(&dev->l2ad_buflist, exists);
10667                         (void) zfs_refcount_add_many(&dev->l2ad_alloc,
10668                             arc_hdr_size(exists), exists);
10669                         mutex_exit(&dev->l2ad_mtx);
10670                         l2arc_hdr_arcstats_increment(exists);
10671                         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
10672                 }
10673                 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
10674         }
10675
10676         mutex_exit(hash_lock);
10677 }
10678
10679 /*
10680  * Starts an asynchronous read IO to read a log block. This is used in log
10681  * block reconstruction to start reading the next block before we are done
10682  * decoding and reconstructing the current block, to keep the l2arc device
10683  * nice and hot with read IO to process.
10684  * The returned zio will contain a newly allocated memory buffers for the IO
10685  * data which should then be freed by the caller once the zio is no longer
10686  * needed (i.e. due to it having completed). If you wish to abort this
10687  * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
10688  * care of disposing of the allocated buffers correctly.
10689  */
10690 static zio_t *
10691 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
10692     l2arc_log_blk_phys_t *lb)
10693 {
10694         uint32_t                asize;
10695         zio_t                   *pio;
10696         l2arc_read_callback_t   *cb;
10697
10698         /* L2BLK_GET_PSIZE returns aligned size for log blocks */
10699         asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
10700         ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
10701
10702         cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
10703         cb->l2rcb_abd = abd_get_from_buf(lb, asize);
10704         pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
10705             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
10706             ZIO_FLAG_DONT_RETRY);
10707         (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
10708             cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
10709             ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
10710             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
10711
10712         return (pio);
10713 }
10714
10715 /*
10716  * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
10717  * buffers allocated for it.
10718  */
10719 static void
10720 l2arc_log_blk_fetch_abort(zio_t *zio)
10721 {
10722         (void) zio_wait(zio);
10723 }
10724
10725 /*
10726  * Creates a zio to update the device header on an l2arc device.
10727  */
10728 void
10729 l2arc_dev_hdr_update(l2arc_dev_t *dev)
10730 {
10731         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
10732         const uint64_t          l2dhdr_asize = dev->l2ad_dev_hdr_asize;
10733         abd_t                   *abd;
10734         int                     err;
10735
10736         VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
10737
10738         l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
10739         l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
10740         l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
10741         l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
10742         l2dhdr->dh_log_entries = dev->l2ad_log_entries;
10743         l2dhdr->dh_evict = dev->l2ad_evict;
10744         l2dhdr->dh_start = dev->l2ad_start;
10745         l2dhdr->dh_end = dev->l2ad_end;
10746         l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
10747         l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
10748         l2dhdr->dh_flags = 0;
10749         l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
10750         l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
10751         if (dev->l2ad_first)
10752                 l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
10753
10754         abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
10755
10756         err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
10757             VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
10758             NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
10759
10760         abd_free(abd);
10761
10762         if (err != 0) {
10763                 zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
10764                     "vdev guid: %llu", err,
10765                     (u_longlong_t)dev->l2ad_vdev->vdev_guid);
10766         }
10767 }
10768
10769 /*
10770  * Commits a log block to the L2ARC device. This routine is invoked from
10771  * l2arc_write_buffers when the log block fills up.
10772  * This function allocates some memory to temporarily hold the serialized
10773  * buffer to be written. This is then released in l2arc_write_done.
10774  */
10775 static void
10776 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
10777 {
10778         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
10779         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
10780         uint64_t                psize, asize;
10781         zio_t                   *wzio;
10782         l2arc_lb_abd_buf_t      *abd_buf;
10783         uint8_t                 *tmpbuf;
10784         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
10785
10786         VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
10787
10788         tmpbuf = zio_buf_alloc(sizeof (*lb));
10789         abd_buf = zio_buf_alloc(sizeof (*abd_buf));
10790         abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
10791         lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
10792         lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
10793
10794         /* link the buffer into the block chain */
10795         lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
10796         lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
10797
10798         /*
10799          * l2arc_log_blk_commit() may be called multiple times during a single
10800          * l2arc_write_buffers() call. Save the allocated abd buffers in a list
10801          * so we can free them in l2arc_write_done() later on.
10802          */
10803         list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
10804
10805         /* try to compress the buffer */
10806         psize = zio_compress_data(ZIO_COMPRESS_LZ4,
10807             abd_buf->abd, tmpbuf, sizeof (*lb), 0);
10808
10809         /* a log block is never entirely zero */
10810         ASSERT(psize != 0);
10811         asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
10812         ASSERT(asize <= sizeof (*lb));
10813
10814         /*
10815          * Update the start log block pointer in the device header to point
10816          * to the log block we're about to write.
10817          */
10818         l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
10819         l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
10820         l2dhdr->dh_start_lbps[0].lbp_payload_asize =
10821             dev->l2ad_log_blk_payload_asize;
10822         l2dhdr->dh_start_lbps[0].lbp_payload_start =
10823             dev->l2ad_log_blk_payload_start;
10824         _NOTE(CONSTCOND)
10825         L2BLK_SET_LSIZE(
10826             (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
10827         L2BLK_SET_PSIZE(
10828             (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
10829         L2BLK_SET_CHECKSUM(
10830             (&l2dhdr->dh_start_lbps[0])->lbp_prop,
10831             ZIO_CHECKSUM_FLETCHER_4);
10832         if (asize < sizeof (*lb)) {
10833                 /* compression succeeded */
10834                 bzero(tmpbuf + psize, asize - psize);
10835                 L2BLK_SET_COMPRESS(
10836                     (&l2dhdr->dh_start_lbps[0])->lbp_prop,
10837                     ZIO_COMPRESS_LZ4);
10838         } else {
10839                 /* compression failed */
10840                 bcopy(lb, tmpbuf, sizeof (*lb));
10841                 L2BLK_SET_COMPRESS(
10842                     (&l2dhdr->dh_start_lbps[0])->lbp_prop,
10843                     ZIO_COMPRESS_OFF);
10844         }
10845
10846         /* checksum what we're about to write */
10847         fletcher_4_native(tmpbuf, asize, NULL,
10848             &l2dhdr->dh_start_lbps[0].lbp_cksum);
10849
10850         abd_free(abd_buf->abd);
10851
10852         /* perform the write itself */
10853         abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
10854         abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
10855         wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
10856             asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
10857             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
10858         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
10859         (void) zio_nowait(wzio);
10860
10861         dev->l2ad_hand += asize;
10862         /*
10863          * Include the committed log block's pointer  in the list of pointers
10864          * to log blocks present in the L2ARC device.
10865          */
10866         bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
10867             sizeof (l2arc_log_blkptr_t));
10868         mutex_enter(&dev->l2ad_mtx);
10869         list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
10870         ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
10871         ARCSTAT_BUMP(arcstat_l2_log_blk_count);
10872         zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
10873         zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
10874         mutex_exit(&dev->l2ad_mtx);
10875         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
10876
10877         /* bump the kstats */
10878         ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
10879         ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
10880         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
10881         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
10882             dev->l2ad_log_blk_payload_asize / asize);
10883
10884         /* start a new log block */
10885         dev->l2ad_log_ent_idx = 0;
10886         dev->l2ad_log_blk_payload_asize = 0;
10887         dev->l2ad_log_blk_payload_start = 0;
10888 }
10889
10890 /*
10891  * Validates an L2ARC log block address to make sure that it can be read
10892  * from the provided L2ARC device.
10893  */
10894 boolean_t
10895 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
10896 {
10897         /* L2BLK_GET_PSIZE returns aligned size for log blocks */
10898         uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
10899         uint64_t end = lbp->lbp_daddr + asize - 1;
10900         uint64_t start = lbp->lbp_payload_start;
10901         boolean_t evicted = B_FALSE;
10902
10903         /*
10904          * A log block is valid if all of the following conditions are true:
10905          * - it fits entirely (including its payload) between l2ad_start and
10906          *   l2ad_end
10907          * - it has a valid size
10908          * - neither the log block itself nor part of its payload was evicted
10909          *   by l2arc_evict():
10910          *
10911          *              l2ad_hand          l2ad_evict
10912          *              |                        |      lbp_daddr
10913          *              |     start              |      |  end
10914          *              |     |                  |      |  |
10915          *              V     V                  V      V  V
10916          *   l2ad_start ============================================ l2ad_end
10917          *                    --------------------------||||
10918          *                              ^                ^
10919          *                              |               log block
10920          *                              payload
10921          */
10922
10923         evicted =
10924             l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
10925             l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
10926             l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
10927             l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
10928
10929         return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
10930             asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
10931             (!evicted || dev->l2ad_first));
10932 }
10933
10934 /*
10935  * Inserts ARC buffer header `hdr' into the current L2ARC log block on
10936  * the device. The buffer being inserted must be present in L2ARC.
10937  * Returns B_TRUE if the L2ARC log block is full and needs to be committed
10938  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
10939  */
10940 static boolean_t
10941 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
10942 {
10943         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
10944         l2arc_log_ent_phys_t    *le;
10945
10946         if (dev->l2ad_log_entries == 0)
10947                 return (B_FALSE);
10948
10949         int index = dev->l2ad_log_ent_idx++;
10950
10951         ASSERT3S(index, <, dev->l2ad_log_entries);
10952         ASSERT(HDR_HAS_L2HDR(hdr));
10953
10954         le = &lb->lb_entries[index];
10955         bzero(le, sizeof (*le));
10956         le->le_dva = hdr->b_dva;
10957         le->le_birth = hdr->b_birth;
10958         le->le_daddr = hdr->b_l2hdr.b_daddr;
10959         if (index == 0)
10960                 dev->l2ad_log_blk_payload_start = le->le_daddr;
10961         L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
10962         L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
10963         L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
10964         le->le_complevel = hdr->b_complevel;
10965         L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
10966         L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
10967         L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
10968         L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
10969
10970         dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
10971             HDR_GET_PSIZE(hdr));
10972
10973         return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
10974 }
10975
/*
 * Tells whether the L2ARC device address `check' lies inside the
 * time-sequential range that begins at `bottom' (written to earlier) and
 * ends at `top' (written to later), both ends included.
 *
 * The L2ARC is a rotary buffer, so a range may wrap around the end of the
 * device and a plain interval test is not enough. Three cases exist:
 *
 *	bottom == top : the range is a single address
 *	 `check' has to be equal to it.
 *
 *	bottom < top : the range is contiguous
 *	 |---------------<bottom>============<top>--------------|
 *	 `check' has to lie within [bottom, top].
 *
 *	bottom > top : the range wraps around the end of the device
 *	 |===============<top>---------------<bottom>===========|
 *	 `check' has to lie at or below top, or at or above bottom.
 */
boolean_t
l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
{
	/* Degenerate range: a single address. */
	if (bottom == top)
		return (check == top);

	/* Contiguous range: both bounds have to hold. */
	if (bottom < top)
		return (check >= bottom && check <= top);

	/* Wrapped range: being inside either of the two pieces suffices. */
	return (check >= bottom || check <= top);
}
11015
/*
 * ARC entry points made available through EXPORT_SYMBOL: buffer size and
 * info queries, the read/write paths, and registration and removal of
 * prune callbacks.
 * NOTE(review): presumably exported for consumers outside this module --
 * confirm against the platform's EXPORT_SYMBOL definition.
 */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
11023
11024 /* BEGIN CSTYLED */
11025 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
11026         param_get_long, ZMOD_RW, "Min arc size");
11027
11028 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
11029         param_get_long, ZMOD_RW, "Max arc size");
11030
11031 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
11032         param_get_long, ZMOD_RW, "Metadata limit for arc size");
11033
11034 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
11035         param_set_arc_long, param_get_long, ZMOD_RW,
11036         "Percent of arc size for arc meta limit");
11037
11038 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
11039         param_get_long, ZMOD_RW, "Min arc metadata");
11040
11041 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
11042         "Meta objects to scan for prune");
11043
11044 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW,
11045         "Limit number of restarts in arc_evict_meta");
11046
11047 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
11048         "Meta reclaim strategy");
11049
11050 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
11051         param_get_int, ZMOD_RW, "Seconds before growing arc size");
11052
11053 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
11054         "Disable arc_p adapt dampener");
11055
11056 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
11057         param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
11058
11059 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
11060         "Percent of pagecache to reclaim arc to");
11061
11062 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
11063         param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
11064
11065 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
11066         "Target average block size");
11067
11068 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
11069         "Disable compressed arc buffers");
11070
11071 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
11072         param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
11073
11074 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
11075         param_set_arc_int, param_get_int, ZMOD_RW,
11076         "Min life of prescient prefetched block in ms");
11077
11078 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW,
11079         "Max write bytes per interval");
11080
11081 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW,
11082         "Extra write bytes during device warmup");
11083
11084 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
11085         "Number of max device writes to precache");
11086
11087 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
11088         "Compressed l2arc_headroom multiplier");
11089
11090 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
11091         "TRIM ahead L2ARC write size multiplier");
11092
11093 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
11094         "Seconds between L2ARC writing");
11095
11096 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW,
11097         "Min feed interval in milliseconds");
11098
11099 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
11100         "Skip caching prefetched buffers");
11101
11102 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
11103         "Turbo L2ARC warmup");
11104
11105 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
11106         "No reads during writes");
11107
11108 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW,
11109         "Percent of ARC size allowed for L2ARC-only headers");
11110
11111 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
11112         "Rebuild the L2ARC when importing a pool");
11113
11114 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
11115         "Min size in bytes to write rebuild log blocks in L2ARC");
11116
11117 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
11118         "Cache only MFU data from ARC into L2ARC");
11119
11120 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
11121         param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
11122
11123 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
11124         param_get_long, ZMOD_RW, "System free memory target size in bytes");
11125
11126 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
11127         param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
11128
11129 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
11130         param_set_arc_long, param_get_long, ZMOD_RW,
11131         "Percent of ARC meta buffers for dnodes");
11132
11133 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
11134         "Percentage of excess dnodes to try to unpin");
11135
11136 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
11137         "When full, ARC allocation waits for eviction of this % of alloc size");
11138
11139 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
11140         "The number of headers to evict per sublist before moving to the next");
11141 /* END CSTYLED */