sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2018, Joyent, Inc.
  24  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28
  29 /*
  30  * DVA-based Adjustable Replacement Cache
  31  *
  32  * While much of the theory of operation used here is
  33  * based on the self-tuning, low overhead replacement cache
  34  * presented by Megiddo and Modha at FAST 2003, there are some
  35  * significant differences:
  36  *
  37  * 1. The Megiddo and Modha model assumes any page is evictable.
  38  * Pages in its cache cannot be "locked" into memory.  This makes
  39  * the eviction algorithm simple: evict the last page in the list.
  40  * This also makes the performance characteristics easy to reason
  41  * about.  Our cache is not so simple.  At any given moment, some
  42  * subset of the blocks in the cache are un-evictable because we
  43  * have handed out a reference to them.  Blocks are only evictable
  44  * when there are no external references active.  This makes
  45  * eviction far more problematic:  we choose to evict the evictable
  46  * blocks that are the "lowest" in the list.
  47  *
  48  * There are times when it is not possible to evict the requested
  49  * space.  In these circumstances we are unable to adjust the cache
  50  * size.  To prevent the cache growing unbounded at these times we
  51  * implement a "cache throttle" that slows the flow of new data
  52  * into the cache until we can make space available.
  53  *
  54  * 2. The Megiddo and Modha model assumes a fixed cache size.
  55  * Pages are evicted when the cache is full and there is a cache
  56  * miss.  Our model has a variable sized cache.  It grows with
  57  * high use, but also tries to react to memory pressure from the
  58  * operating system: decreasing its size when system memory is
  59  * tight.
  60  *
  61  * 3. The Megiddo and Modha model assumes a fixed page size. All
  62  * elements of the cache are therefore exactly the same size.  So
  63  * when adjusting the cache size following a cache miss, it's simply
  64  * a matter of choosing a single page to evict.  In our model, we
  65  * have variable sized cache blocks (ranging from 512 bytes to
  66  * 128K bytes).  We therefore choose a set of blocks to evict to make
  67  * space for a cache miss that approximates as closely as possible
  68  * the space used by the new block.
  69  *
  70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  71  * by N. Megiddo & D. Modha, FAST 2003
  72  */
  73
  74 /*
  75  * The locking model:
  76  *
  77  * A new reference to a cache buffer can be obtained in two
  78  * ways: 1) via a hash table lookup using the DVA as a key,
  79  * or 2) via one of the ARC lists.  The arc_read() interface
  80  * uses method 1, while the internal ARC algorithms for
  81  * adjusting the cache use method 2.  We therefore provide two
  82  * types of locks: 1) the hash table lock array, and 2) the
  83  * ARC list locks.
  84  *
  85  * Buffers do not have their own mutexes, rather they rely on the
  86  * hash table mutexes for the bulk of their protection (i.e. most
  87  * fields in the arc_buf_hdr_t are protected by these mutexes).
  88  *
  89  * buf_hash_find() returns the appropriate mutex (held) when it
  90  * locates the requested buffer in the hash table.  It returns
  91  * NULL for the mutex if the buffer was not in the table.
  92  *
  93  * buf_hash_remove() expects the appropriate hash mutex to be
  94  * already held before it is invoked.
  95  *
  96  * Each ARC state also has a mutex which is used to protect the
  97  * buffer list associated with the state.  When attempting to
  98  * obtain a hash table lock while holding an ARC list lock you
  99  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 100  * the active state mutex must be held before the ghost state mutex.
 101  *
 102  * It is also possible to register a callback which is run when the
 103  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 104  * this case the arc user should drop a reference on some arc buffers so
 105  * they can be reclaimed and the arc_meta_limit honored.  For example,
 106  * when using the ZPL each dentry holds a reference on a znode.  These
 107  * dentries must be pruned before the arc buffer holding the znode can
 108  * be safely evicted.
 109  *
 110  * Note that the majority of the performance stats are manipulated
 111  * with atomic operations.
 112  *
 113  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 114  *
 115  *      - L2ARC buflist creation
 116  *      - L2ARC buflist eviction
 117  *      - L2ARC write completion, which walks L2ARC buflists
 118  *      - ARC header destruction, as it removes from L2ARC buflists
 119  *      - ARC header release, as it removes from L2ARC buflists
 120  */
 121
 122 /*
 123  * ARC operation:
 124  *
 125  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 126  * This structure can point either to a block that is still in the cache or to
 127  * one that is only accessible in an L2 ARC device, or it can provide
 128  * information about a block that was recently evicted. If a block is
 129  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 130  * information to retrieve it from the L2ARC device. This information is
 131  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 132  * that is in this state cannot access the data directly.
 133  *
 134  * Blocks that are actively being referenced or have not been evicted
 135  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 136  * the arc_buf_hdr_t that will point to the data block in memory. A block can
 137  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 138  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 139  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 140  *
 141  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 142  * ability to store the physical data (b_pabd) associated with the DVA of the
 143  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 144  * it will match its on-disk compression characteristics. This behavior can be
 145  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 146  * compressed ARC functionality is disabled, the b_pabd will point to an
 147  * uncompressed version of the on-disk data.
 148  *
 149  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 150  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 151  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 152  * consumer. The ARC will provide references to this data and will keep it
 153  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
 154  * data block and will evict any arc_buf_t that is no longer referenced. The
 155  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
 156  * "overhead_size" kstat.
 157  *
 158  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 159  * compressed form. The typical case is that consumers will want uncompressed
 160  * data, and when that happens a new data buffer is allocated where the data is
 161  * decompressed for them to use. Currently the only consumer who wants
 162  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 163  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 164  * with the arc_buf_hdr_t.
 165  *
 166  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
 167  * first one is owned by a compressed send consumer (and therefore references
 168  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
 169  * used by any other consumer (and has its own uncompressed copy of the data
 170  * buffer).
 171  *
 172  *   arc_buf_hdr_t
 173  *   +-----------+
 174  *   | fields    |
 175  *   | common to |
 176  *   | L1- and   |
 177  *   | L2ARC     |
 178  *   +-----------+
 179  *   | l2arc_buf_hdr_t
 180  *   |           |
 181  *   +-----------+
 182  *   | l1arc_buf_hdr_t
 183  *   |           |              arc_buf_t
 184  *   | b_buf     +------------>+-----------+      arc_buf_t
 185  *   | b_pabd    +-+           |b_next     +---->+-----------+
 186  *   +-----------+ |           |-----------|     |b_next     +-->NULL
 187  *                 |           |b_comp = T |     +-----------+
 188  *                 |           |b_data     +-+   |b_comp = F |
 189  *                 |           +-----------+ |   |b_data     +-+
 190  *                 +->+------+               |   +-----------+ |
 191  *        compressed  |      |               |                 |
 192  *           data     |      |<--------------+                 | uncompressed
 193  *                    +------+          compressed,            |     data
 194  *                                        shared               +-->+------+
 195  *                                         data                    |      |
 196  *                                                                 |      |
 197  *                                                                 +------+
 198  *
 199  * When a consumer reads a block, the ARC must first look to see if the
 200  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 201  * arc_buf_t and either copies uncompressed data into a new data buffer from an
 202  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 203  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 204  * hdr is compressed and the desired compression characteristics of the
 205  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 206  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 207  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 208  * be anywhere in the hdr's list.
 209  *
 210  * The diagram below shows an example of an uncompressed ARC hdr that is
 211  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 212  * the last element in the buf list):
 213  *
 214  *                arc_buf_hdr_t
 215  *                +-----------+
 216  *                |           |
 217  *                |           |
 218  *                |           |
 219  *                +-----------+
 220  * l2arc_buf_hdr_t|           |
 221  *                |           |
 222  *                +-----------+
 223  * l1arc_buf_hdr_t|           |
 224  *                |           |                 arc_buf_t    (shared)
 225  *                |    b_buf  +------------>+---------+      arc_buf_t
 226  *                |           |             |b_next   +---->+---------+
 227  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 228  *                +-----------+ |           |         |     +---------+
 229  *                              |           |b_data   +-+   |         |
 230  *                              |           +---------+ |   |b_data   +-+
 231  *                              +->+------+             |   +---------+ |
 232  *                                 |      |             |               |
 233  *                   uncompressed  |      |             |               |
 234  *                        data     +------+             |               |
 235  *                                    ^                 +->+------+     |
 236  *                                    |       uncompressed |      |     |
 237  *                                    |           data     |      |     |
 238  *                                    |                    +------+     |
 239  *                                    +---------------------------------+
 240  *
 241  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 242  * since the physical block is about to be rewritten. The new data contents
 243  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 244  * it may compress the data before writing it to disk. The ARC will be called
 245  * with the transformed data and will bcopy the transformed on-disk block into
 246  * a newly allocated b_pabd. Writes are always done into buffers which have
 247  * either been loaned (and hence are new and don't have other readers) or
 248  * buffers which have been released (and hence have their own hdr, if there
 249  * were originally other readers of the buf's original hdr). This ensures that
 250  * the ARC only needs to update a single buf and its hdr after a write occurs.
 251  *
 252  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 253  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 254  * that when compressed ARC is enabled that the L2ARC blocks are identical
 255  * to the on-disk block in the main data pool. This provides a significant
 256  * advantage since the ARC can leverage the bp's checksum when reading from the
 257  * L2ARC to determine if the contents are valid. However, if the compressed
 258  * ARC is disabled, then the L2ARC's block must be transformed to look
 259  * like the physical block in the main data pool before comparing the
 260  * checksum and determining its validity.
 261  */
 262
 263 #include <sys/spa.h>
 264 #include <sys/zio.h>
 265 #include <sys/spa_impl.h>
 266 #include <sys/zio_compress.h>
 267 #include <sys/zio_checksum.h>
 268 #include <sys/zfs_context.h>
 269 #include <sys/arc.h>
 270 #include <sys/refcount.h>
 271 #include <sys/vdev.h>
 272 #include <sys/vdev_impl.h>
 273 #include <sys/dsl_pool.h>
 274 #include <sys/zio_checksum.h>
 275 #include <sys/multilist.h>
 276 #include <sys/abd.h>
 277 #ifdef _KERNEL
 278 #include <sys/dnlc.h>
 279 #include <sys/racct.h>
 280 #endif
 281 #include <sys/callb.h>
 282 #include <sys/kstat.h>
 283 #include <sys/trim_map.h>
 284 #include <zfs_fletcher.h>
 285 #include <sys/sdt.h>
 286 #include <sys/aggsum.h>
 287 #include <sys/cityhash.h>
 288
 289 #include <machine/vmparam.h>
 290
 291 #ifdef illumos
 292 #ifndef _KERNEL
 293 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 294 boolean_t arc_watch = B_FALSE;
 295 int arc_procfd;
 296 #endif
 297 #endif /* illumos */
 298
 299 static kmutex_t         arc_reclaim_lock;
 300 static kcondvar_t       arc_reclaim_thread_cv;
 301 static boolean_t        arc_reclaim_thread_exit;
 302 static kcondvar_t       arc_reclaim_waiters_cv;
     /*
      * NOTE(review): the objects above and below form two groups of
      * mutex / condition variable / exit flag, one named for the ARC
      * reclaim thread (with an extra cv for waiters) and one named for
      * DNLC eviction.  Their users are not visible in this part of the
      * file -- confirm the locking protocol at the point of use.
      */
 303
 304 static kmutex_t         arc_dnlc_evicts_lock;
 305 static kcondvar_t       arc_dnlc_evicts_cv;
 306 static boolean_t        arc_dnlc_evicts_thread_exit;
 307
     /*
      * NOTE(review): presumably the percentage of the DNLC to shed when
      * the ARC is asked to reduce it -- confirm against the consumer.
      */
 308 uint_t arc_reduce_dnlc_percent = 3;
 309
 310 /*
 311  * The number of headers to evict in arc_evict_state_impl() before
 312  * dropping the sublist lock and evicting from another sublist. A lower
 313  * value means we're more likely to evict the "correct" header (i.e. the
 314  * oldest header in the arc state), but comes with higher overhead
 315  * (i.e. more invocations of arc_evict_state_impl()).
 316  */
 317 int zfs_arc_evict_batch_limit = 10;
 318
 319 /* number of seconds before growing cache again */
 320 static int              arc_grow_retry = 60;
 321
 322 /* number of milliseconds before attempting a kmem-cache-reap */
 323 static int              arc_kmem_cache_reap_retry_ms = 1000;
 324
 325 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 326 int             zfs_arc_overflow_shift = 8;
 327
 328 /* shift of arc_c for calculating both min and max arc_p */
 329 static int              arc_p_min_shift = 4;
 330
 331 /* log2(fraction of arc to reclaim) */
 332 static int              arc_shrink_shift = 7;
 333
 334 /*
 335  * log2(fraction of ARC which must be free to allow growing).
 336  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 337  * when reading a new block into the ARC, we will evict an equal-sized block
 338  * from the ARC.
 339  *
 340  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 341  * we will still not allow it to grow.
      *
      * The compiled-in defaults satisfy this: 5 here versus 7 for
      * arc_shrink_shift.
 342  */
 343 int                     arc_no_grow_shift = 5;
 344
 345
 346 /*
 347  * minimum lifespan of a prefetch block in clock ticks
 348  * (initialized in arc_init())
      *
      * NOTE(review): the "_ms" suffix on both variables suggests the unit is
      * milliseconds rather than clock ticks -- confirm against the users of
      * these values (not visible in this part of the file).
 349  */
 350 static int              zfs_arc_min_prefetch_ms = 1;
 351 static int              zfs_arc_min_prescient_prefetch_ms = 6;
 352
 353 /*
 354  * If this percent of memory is free, don't throttle.
 355  */
 356 int arc_lotsfree_percent = 10;
 357
     /*
      * arc_dead has no explicit initializer, so it starts out as zero.
      * zfs_prefetch_disable is only declared here (extern); its definition
      * lives outside this file.
      */
 358 static int arc_dead;
 359 extern boolean_t zfs_prefetch_disable;
 360
 361 /*
 362  * The arc has filled available memory and has now warmed up.
 363  */
 364 static boolean_t arc_warm;
 365
 366 /*
 367  * log2 fraction of the zio arena to keep free.
 368  */
 369 int arc_zio_arena_free_shift = 2;
 370
 371 /*
 372  * These tunables are for performance analysis.
      *
      * NOTE(review): several of them mirror a static "arc_*" working value
      * declared earlier in this file (e.g. zfs_arc_grow_retry / arc_grow_retry)
      * and default to 0; presumably 0 means "keep the built-in default" --
      * confirm where the values are consumed.
 373  */
 374 uint64_t zfs_arc_max;
 375 uint64_t zfs_arc_min;
 376 uint64_t zfs_arc_meta_limit = 0;
 377 uint64_t zfs_arc_meta_min = 0;
 378 uint64_t zfs_arc_dnode_limit = 0;
 379 uint64_t zfs_arc_dnode_reduce_percent = 10;
 380 int zfs_arc_grow_retry = 0;
 381 int zfs_arc_shrink_shift = 0;
 382 int zfs_arc_no_grow_shift = 0;
 383 int zfs_arc_p_min_shift = 0;
 384 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
     /*
      * On FreeBSD kernels this is overwritten at boot by
      * arc_free_target_init() with vm_cnt.v_free_target.
      */
 385 u_int zfs_arc_free_target = 0;
 386
 387 /* Absolute min for arc min / max is 16MB (16 << 20 bytes). */
 388 static uint64_t arc_abs_min = 16 << 20;
 389
 390 /*
 391  * ARC dirty data constraints for arc_tempreserve_space() throttle
 392  */
 393 uint_t zfs_arc_dirty_limit_percent = 50;        /* total dirty data limit */
 394 uint_t zfs_arc_anon_limit_percent = 25;         /* anon block dirty limit */
 395 uint_t zfs_arc_pool_dirty_percent = 20;         /* each pool's anon allowance */
 396
     /*
      * Compressed ARC switch.  As described in the "ARC operation" comment
      * at the top of this file, when this is B_FALSE the hdr's b_pabd points
      * to an uncompressed version of the on-disk data.
      */
 397 boolean_t zfs_compressed_arc_enabled = B_TRUE;
 398
     /*
      * Prototypes for the sysctl handlers named in the SYSCTL_PROC()
      * registrations that follow.
      */
 399 static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
 400 static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
 401 static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
 402 static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
 403 static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);
 404
 405 #if defined(__FreeBSD__) && defined(_KERNEL)
     /*
      * Seed zfs_arc_free_target from the VM system's v_free_target.
      *
      * The function is registered with SYSINIT at SI_SUB_KTHREAD_PAGE /
      * SI_ORDER_ANY rather than being a static initializer; the comment on
      * the arc_free_target sysctl in this file attributes that to a
      * dependency on pagedaemon initialisation.  The argument is unused.
      */
 406 static void
 407 arc_free_target_init(void *unused __unused)
 408 {
 409
 410         zfs_arc_free_target = vm_cnt.v_free_target;
 411 }
 412 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
 413     arc_free_target_init, NULL);
 414
 415 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 416 TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
 417 TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
 418 TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
 419 TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
 420 SYSCTL_DECL(_vfs_zfs);
     /*
      * arc_max, arc_min and arc_no_grow_shift go through handler functions
      * (declared earlier in this file) instead of being bound directly to
      * a variable.
      *
      * NOTE(review): arc_no_grow_shift is declared as a plain int in this
      * file but is registered as CTLTYPE_U32 with format "U" -- confirm the
      * handler treats the value consistently.
      */
 421 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
 422     0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
 423 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
 424     0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
 425 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
 426     0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
 427     "log2(fraction of ARC which must be free to allow growing)");
 428 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
 429     &zfs_arc_average_blocksize, 0,
 430     "ARC average blocksize");
     /*
      * Note the asymmetry for arc_shrink_shift and arc_grow_retry: the
      * loader tunables above write the zfs_arc_* variables, whereas the
      * run-time sysctls of the same name below expose the static arc_*
      * working values.
      */
 431 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
 432     &arc_shrink_shift, 0,
 433     "log2(fraction of arc to reclaim)");
 434 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
 435     &arc_grow_retry, 0,
 436     "Wait in seconds before considering growing ARC");
 437 SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
 438     &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
 439
 440 /*
 441  * We don't have a tunable for arc_free_target due to the dependency on
 442  * pagedaemon initialisation.
      * Writes are range-checked by sysctl_vfs_zfs_arc_free_target().
 443  */
 444 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
 445     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
 446     sysctl_vfs_zfs_arc_free_target, "IU",
 447     "Desired number of free pages below which ARC triggers reclaim");
 448
     /*
      * Sysctl handler for the arc_free_target node under _vfs_zfs.
      *
      * A local copy of zfs_arc_free_target is handed to sysctl_handle_int().
      * If that call fails, or the request carries no new value
      * (req->newptr == NULL), its result is returned unchanged.  Otherwise
      * the new value is stored only if it lies within
      * [minfree, vm_cnt.v_page_count].
      *
      * Returns 0 on success, the error from sysctl_handle_int(), or EINVAL
      * when the new value is out of range.
      */
 449 static int
 450 sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
 451 {
 452         u_int val;
 453         int err;
 454
             /* Work on a copy so a rejected value never reaches the global. */
 455         val = zfs_arc_free_target;
 456         err = sysctl_handle_int(oidp, &val, 0, req);
 457         if (err != 0 || req->newptr == NULL)
 458                 return (err);
 459
             /* Reject targets outside [minfree, v_page_count]. */
 460         if (val < minfree)
 461                 return (EINVAL);
 462         if (val > vm_cnt.v_page_count)
 463                 return (EINVAL);
 464
 465         zfs_arc_free_target = val;
 466
 467         return (0);
 468 }
 469
 470 /*
 471  * Must be declared here, before the definition of the corresponding kstat
 472  * macro, which uses the same name and would otherwise confuse the compiler.
 473  */
 474 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
 475     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
 476     sysctl_vfs_zfs_arc_meta_limit, "QU",
 477     "ARC metadata limit");
 478 #endif
 479
 480 /*
 481  * Note that buffers can be in one of 6 states:
 482  *      ARC_anon        - anonymous (discussed below)
 483  *      ARC_mru         - recently used, currently cached
 484  *      ARC_mru_ghost   - recently used, no longer in cache
 485  *      ARC_mfu         - frequently used, currently cached
 486  *      ARC_mfu_ghost   - frequently used, no longer in cache
 487  *      ARC_l2c_only    - exists in L2ARC but not other states
 488  * When there are no active references to the buffer, they are
 489  * linked onto a list in one of these arc states.  These are
 490  * the only buffers that can be evicted or deleted.  Within each
 491  * state there are multiple lists, one for meta-data and one for
 492  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 493  * etc.) is tracked separately so that it can be managed more
 494  * explicitly: favored over data, limited explicitly.
 495  *
 496  * Anonymous buffers are buffers that are not associated with
 497  * a DVA.  These are buffers that hold dirty block copies
 498  * before they are written to stable storage.  By definition,
 499  * they are "ref'd" and are considered part of arc_mru
 500  * that cannot be freed.  Generally, they will acquire a DVA
 501  * as they are written and migrate onto the arc_mru list.
 502  *
 503  * The ARC_l2c_only state is for buffers that are in the second
 504  * level ARC but no longer in any of the ARC_m* lists.  The second
 505  * level ARC itself may also contain buffers that are in any of
 506  * the ARC_m* states - meaning that a buffer can exist in two
 507  * places.  The reason for the ARC_l2c_only state is to keep the
 508  * buffer header in the hash table, so that reads that hit the
 509  * second level ARC benefit from these fast lookups.
 510  */
 511
     /*
      * Bookkeeping for a single ARC state.  One instance exists for each of
      * the six states declared in this file (ARC_anon, ARC_mru,
      * ARC_mru_ghost, ARC_mfu, ARC_mfu_ghost, ARC_l2c_only).  The two arrays
      * are sized by ARC_BUFC_NUMTYPES, i.e. one slot per buffer content type.
      */
 512 typedef struct arc_state {
 513         /*
 514          * list of evictable buffers
 515          */
 516         multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
 517         /*
 518          * total amount of evictable data in this state
 519          */
 520         refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
 521         /*
 522          * total amount of data in this state; this includes: evictable,
 523          * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
 524          */
 525         refcount_t arcs_size;
 526         /*
 527          * supports the "dbufs" kstat
 528          */
 529         arc_state_type_t arcs_state;
 530 } arc_state_t;
 531
 532 /*
 533  * Percentage that can be consumed by dnodes of ARC meta buffers.
      *
      * NOTE(review): this sentence reads as a description of
      * zfs_arc_dnode_limit_percent, not of zfs_arc_meta_prune, which
      * immediately follows it; the other three variables in this group have
      * no description here -- confirm their meaning at the point of use.
 534  */
 535 int zfs_arc_meta_prune = 10000;
 536 unsigned long zfs_arc_dnode_limit_percent = 10;
 537 int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
 538 int zfs_arc_meta_adjust_restarts = 4096;
 539
 540 /* The 6 states: */
 541 static arc_state_t ARC_anon;
 542 static arc_state_t ARC_mru;
 543 static arc_state_t ARC_mru_ghost;
 544 static arc_state_t ARC_mfu;
 545 static arc_state_t ARC_mfu_ghost;
 546 static arc_state_t ARC_l2c_only;
 547
 548 typedef struct arc_stats {
 549         kstat_named_t arcstat_hits;
 550         kstat_named_t arcstat_misses;
 551         kstat_named_t arcstat_demand_data_hits;
 552         kstat_named_t arcstat_demand_data_misses;
 553         kstat_named_t arcstat_demand_metadata_hits;
 554         kstat_named_t arcstat_demand_metadata_misses;
 555         kstat_named_t arcstat_prefetch_data_hits;
 556         kstat_named_t arcstat_prefetch_data_misses;
 557         kstat_named_t arcstat_prefetch_metadata_hits;
 558         kstat_named_t arcstat_prefetch_metadata_misses;
 559         kstat_named_t arcstat_mru_hits;
 560         kstat_named_t arcstat_mru_ghost_hits;
 561         kstat_named_t arcstat_mfu_hits;
 562         kstat_named_t arcstat_mfu_ghost_hits;
 563         kstat_named_t arcstat_allocated;
 564         kstat_named_t arcstat_deleted;
 565         /*
 566          * Number of buffers that could not be evicted because the hash lock
 567          * was held by another thread.  The lock may not necessarily be held
 568          * by something using the same buffer, since hash locks are shared
 569          * by multiple buffers.
 570          */
 571         kstat_named_t arcstat_mutex_miss;
 572         /*
 573          * Number of buffers skipped when updating the access state due to the
 574          * header having already been released after acquiring the hash lock.
 575          */
 576         kstat_named_t arcstat_access_skip;
 577         /*
 578          * Number of buffers skipped because they have I/O in progress, are
 579          * indirect prefetch buffers that have not lived long enough, or are
 580          * not from the spa we're trying to evict from.
 581          */
 582         kstat_named_t arcstat_evict_skip;
 583         /*
 584          * Number of times arc_evict_state() was unable to evict enough
 585          * buffers to reach it's target amount.
 586          */
 587         kstat_named_t arcstat_evict_not_enough;
 588         kstat_named_t arcstat_evict_l2_cached;
 589         kstat_named_t arcstat_evict_l2_eligible;
 590         kstat_named_t arcstat_evict_l2_ineligible;
 591         kstat_named_t arcstat_evict_l2_skip;
 592         kstat_named_t arcstat_hash_elements;
 593         kstat_named_t arcstat_hash_elements_max;
 594         kstat_named_t arcstat_hash_collisions;
 595         kstat_named_t arcstat_hash_chains;
 596         kstat_named_t arcstat_hash_chain_max;
 597         kstat_named_t arcstat_p;
 598         kstat_named_t arcstat_c;
 599         kstat_named_t arcstat_c_min;
 600         kstat_named_t arcstat_c_max;
 601         /* Not updated directly; only synced in arc_kstat_update. */
 602         kstat_named_t arcstat_size;
 603         /*
 604          * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
 605          * Note that the compressed bytes may match the uncompressed bytes
 606          * if the block is either not compressed or compressed arc is disabled.
 607          */
 608         kstat_named_t arcstat_compressed_size;
 609         /*
 610          * Uncompressed size of the data stored in b_pabd. If compressed
 611          * arc is disabled then this value will be identical to the stat
 612          * above.
 613          */
 614         kstat_named_t arcstat_uncompressed_size;
 615         /*
 616          * Number of bytes stored in all the arc_buf_t's. This is classified
 617          * as "overhead" since this data is typically short-lived and will
 618          * be evicted from the arc when it becomes unreferenced unless the
 619          * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
 620          * values have been set (see comment in dbuf.c for more information).
 621          */
 622         kstat_named_t arcstat_overhead_size;
 623         /*
 624          * Number of bytes consumed by internal ARC structures necessary
 625          * for tracking purposes; these structures are not actually
 626          * backed by ARC buffers. This includes arc_buf_hdr_t structures
 627          * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
 628          * caches), and arc_buf_t structures (allocated via arc_buf_t
 629          * cache).
 630          * Not updated directly; only synced in arc_kstat_update.
 631          */
 632         kstat_named_t arcstat_hdr_size;
 633         /*
 634          * Number of bytes consumed by ARC buffers of type equal to
 635          * ARC_BUFC_DATA. This is generally consumed by buffers backing
 636          * on disk user data (e.g. plain file contents).
 637          * Not updated directly; only synced in arc_kstat_update.
 638          */
 639         kstat_named_t arcstat_data_size;
 640         /*
 641          * Number of bytes consumed by ARC buffers of type equal to
 642          * ARC_BUFC_METADATA. This is generally consumed by buffers
 643          * backing on disk data that is used for internal ZFS
 644          * structures (e.g. ZAP, dnode, indirect blocks, etc).
 645          * Not updated directly; only synced in arc_kstat_update.
 646          */
 647         kstat_named_t arcstat_metadata_size;
 648         /*
 649          * Number of bytes consumed by dmu_buf_impl_t objects.
 650          */
 651         kstat_named_t arcstat_dbuf_size;
 652         /*
 653          * Number of bytes consumed by dnode_t objects.
 654          */
 655         kstat_named_t arcstat_dnode_size;
 656         /*
 657          * Number of bytes consumed by bonus buffers.
 658          */
 659         kstat_named_t arcstat_bonus_size;
 660         /*
 661          * Total number of bytes consumed by ARC buffers residing in the
 662          * arc_anon state. This includes *all* buffers in the arc_anon
 663          * state; e.g. data, metadata, evictable, and unevictable buffers
 664          * are all included in this value.
 665          * Not updated directly; only synced in arc_kstat_update.
 666          */
 667         kstat_named_t arcstat_anon_size;
 668         /*
 669          * Number of bytes consumed by ARC buffers that meet the
 670          * following criteria: backing buffers of type ARC_BUFC_DATA,
 671          * residing in the arc_anon state, and are eligible for eviction
 672          * (e.g. have no outstanding holds on the buffer).
 673          * Not updated directly; only synced in arc_kstat_update.
 674          */
 675         kstat_named_t arcstat_anon_evictable_data;
 676         /*
 677          * Number of bytes consumed by ARC buffers that meet the
 678          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 679          * residing in the arc_anon state, and are eligible for eviction
 680          * (e.g. have no outstanding holds on the buffer).
 681          * Not updated directly; only synced in arc_kstat_update.
 682          */
 683         kstat_named_t arcstat_anon_evictable_metadata;
 684         /*
 685          * Total number of bytes consumed by ARC buffers residing in the
 686          * arc_mru state. This includes *all* buffers in the arc_mru
 687          * state; e.g. data, metadata, evictable, and unevictable buffers
 688          * are all included in this value.
 689          * Not updated directly; only synced in arc_kstat_update.
 690          */
 691         kstat_named_t arcstat_mru_size;
 692         /*
 693          * Number of bytes consumed by ARC buffers that meet the
 694          * following criteria: backing buffers of type ARC_BUFC_DATA,
 695          * residing in the arc_mru state, and are eligible for eviction
 696          * (e.g. have no outstanding holds on the buffer).
 697          * Not updated directly; only synced in arc_kstat_update.
 698          */
 699         kstat_named_t arcstat_mru_evictable_data;
 700         /*
 701          * Number of bytes consumed by ARC buffers that meet the
 702          * following criteria: backing buffers of type ARC_BUFC_METADATA,
 703          * residing in the arc_mru state, and are eligible for eviction
 704          * (e.g. have no outstanding holds on the buffer).
 705          * Not updated directly; only synced in arc_kstat_update.
 706          */
 707         kstat_named_t arcstat_mru_evictable_metadata;
 708         /*
 709          * Total number of bytes that *would have been* consumed by ARC
 710          * buffers in the arc_mru_ghost state. The key thing to note
 711          * here, is the fact that this size doesn't actually indicate
 712          * RAM consumption. The ghost lists only consist of headers and
 713          * don't actually have ARC buffers linked off of these headers.
 714          * Thus, *if* the headers had associated ARC buffers, these
 715          * buffers *would have* consumed this number of bytes.
 716          * Not updated directly; only synced in arc_kstat_update.
 717          */
 718         kstat_named_t arcstat_mru_ghost_size;
 719         /*
 720          * Number of bytes that *would have been* consumed by ARC
 721          * buffers that are eligible for eviction, of type
 722          * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
 723          * Not updated directly; only synced in arc_kstat_update.
 724          */
 725         kstat_named_t arcstat_mru_ghost_evictable_data;
 726         /*
 727          * Number of bytes that *would have been* consumed by ARC
 728          * buffers that are eligible for eviction, of type
 729          * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
 730          * Not updated directly; only synced in arc_kstat_update.
 731          */
 732         kstat_named_t arcstat_mru_ghost_evictable_metadata;
 733         /*
 734          * Total number of bytes consumed by ARC buffers residing in the
 735          * arc_mfu state. This includes *all* buffers in the arc_mfu
 736          * state; e.g. data, metadata, evictable, and unevictable buffers
 737          * are all included in this value.
 738          * Not updated directly; only synced in arc_kstat_update.
 739          */
 740         kstat_named_t arcstat_mfu_size;
 741         /*
 742          * Number of bytes consumed by ARC buffers that are eligible for
 743          * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
 744          * state.
 745          * Not updated directly; only synced in arc_kstat_update.
 746          */
 747         kstat_named_t arcstat_mfu_evictable_data;
 748         /*
 749          * Number of bytes consumed by ARC buffers that are eligible for
 750          * eviction, of type ARC_BUFC_METADATA, and reside in the
 751          * arc_mfu state.
 752          * Not updated directly; only synced in arc_kstat_update.
 753          */
 754         kstat_named_t arcstat_mfu_evictable_metadata;
 755         /*
 756          * Total number of bytes that *would have been* consumed by ARC
 757          * buffers in the arc_mfu_ghost state. See the comment above
 758          * arcstat_mru_ghost_size for more details.
 759          * Not updated directly; only synced in arc_kstat_update.
 760          */
 761         kstat_named_t arcstat_mfu_ghost_size;
 762         /*
 763          * Number of bytes that *would have been* consumed by ARC
 764          * buffers that are eligible for eviction, of type
 765          * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
 766          * Not updated directly; only synced in arc_kstat_update.
 767          */
 768         kstat_named_t arcstat_mfu_ghost_evictable_data;
 769         /*
 770          * Number of bytes that *would have been* consumed by ARC
 771          * buffers that are eligible for eviction, of type
 772          * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
 773          * Not updated directly; only synced in arc_kstat_update.
 774          */
 775         kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 776         kstat_named_t arcstat_l2_hits;
 777         kstat_named_t arcstat_l2_misses;
 778         kstat_named_t arcstat_l2_feeds;
 779         kstat_named_t arcstat_l2_rw_clash;
 780         kstat_named_t arcstat_l2_read_bytes;
 781         kstat_named_t arcstat_l2_write_bytes;
 782         kstat_named_t arcstat_l2_writes_sent;
 783         kstat_named_t arcstat_l2_writes_done;
 784         kstat_named_t arcstat_l2_writes_error;
 785         kstat_named_t arcstat_l2_writes_lock_retry;
 786         kstat_named_t arcstat_l2_evict_lock_retry;
 787         kstat_named_t arcstat_l2_evict_reading;
 788         kstat_named_t arcstat_l2_evict_l1cached;
 789         kstat_named_t arcstat_l2_free_on_write;
 790         kstat_named_t arcstat_l2_abort_lowmem;
 791         kstat_named_t arcstat_l2_cksum_bad;
 792         kstat_named_t arcstat_l2_io_error;
 793         kstat_named_t arcstat_l2_lsize;
 794         kstat_named_t arcstat_l2_psize;
 795         /* Not updated directly; only synced in arc_kstat_update. */
 796         kstat_named_t arcstat_l2_hdr_size;
 797         kstat_named_t arcstat_l2_write_trylock_fail;
 798         kstat_named_t arcstat_l2_write_passed_headroom;
 799         kstat_named_t arcstat_l2_write_spa_mismatch;
 800         kstat_named_t arcstat_l2_write_in_l2;
 801         kstat_named_t arcstat_l2_write_hdr_io_in_progress;
 802         kstat_named_t arcstat_l2_write_not_cacheable;
 803         kstat_named_t arcstat_l2_write_full;
 804         kstat_named_t arcstat_l2_write_buffer_iter;
 805         kstat_named_t arcstat_l2_write_pios;
 806         kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
 807         kstat_named_t arcstat_l2_write_buffer_list_iter;
 808         kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 809         kstat_named_t arcstat_memory_throttle_count;
 810         kstat_named_t arcstat_memory_direct_count;
 811         kstat_named_t arcstat_memory_indirect_count;
 812         kstat_named_t arcstat_memory_all_bytes;
 813         kstat_named_t arcstat_memory_free_bytes;
 814         kstat_named_t arcstat_memory_available_bytes;
 815         kstat_named_t arcstat_no_grow;
 816         kstat_named_t arcstat_tempreserve;
 817         kstat_named_t arcstat_loaned_bytes;
 818         kstat_named_t arcstat_prune;
 819         /* Not updated directly; only synced in arc_kstat_update. */
 820         kstat_named_t arcstat_meta_used;
 821         kstat_named_t arcstat_meta_limit;
 822         kstat_named_t arcstat_dnode_limit;
 823         kstat_named_t arcstat_meta_max;
 824         kstat_named_t arcstat_meta_min;
 825         kstat_named_t arcstat_async_upgrade_sync;
 826         kstat_named_t arcstat_demand_hit_predictive_prefetch;
 827         kstat_named_t arcstat_demand_hit_prescient_prefetch;
 828 } arc_stats_t;
 829
/*
 * The single instance of the ARC statistics block.
 *
 * The initializer is positional: each entry supplies the exported kstat
 * name and the KSTAT_DATA_UINT64 data type for the arc_stats_t member at
 * the same position, so the entries must be kept in the same order as the
 * members of arc_stats_t.  A few exported names differ from the name of
 * the member they initialize; those are called out below.
 */
static arc_stats_t arc_stats = {
        { "hits",                       KSTAT_DATA_UINT64 },
        { "misses",                     KSTAT_DATA_UINT64 },
        { "demand_data_hits",           KSTAT_DATA_UINT64 },
        { "demand_data_misses",         KSTAT_DATA_UINT64 },
        { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
        { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
        { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
        { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
        { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
        { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
        { "mru_hits",                   KSTAT_DATA_UINT64 },
        { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
        { "mfu_hits",                   KSTAT_DATA_UINT64 },
        { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
        { "allocated",                  KSTAT_DATA_UINT64 },
        { "deleted",                    KSTAT_DATA_UINT64 },
        { "mutex_miss",                 KSTAT_DATA_UINT64 },
        { "access_skip",                KSTAT_DATA_UINT64 },
        { "evict_skip",                 KSTAT_DATA_UINT64 },
        { "evict_not_enough",           KSTAT_DATA_UINT64 },
        { "evict_l2_cached",            KSTAT_DATA_UINT64 },
        { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
        { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
        { "evict_l2_skip",              KSTAT_DATA_UINT64 },
        { "hash_elements",              KSTAT_DATA_UINT64 },
        { "hash_elements_max",          KSTAT_DATA_UINT64 },
        { "hash_collisions",            KSTAT_DATA_UINT64 },
        { "hash_chains",                KSTAT_DATA_UINT64 },
        { "hash_chain_max",             KSTAT_DATA_UINT64 },
        { "p",                          KSTAT_DATA_UINT64 },
        { "c",                          KSTAT_DATA_UINT64 },
        { "c_min",                      KSTAT_DATA_UINT64 },
        { "c_max",                      KSTAT_DATA_UINT64 },
        { "size",                       KSTAT_DATA_UINT64 },
        { "compressed_size",            KSTAT_DATA_UINT64 },
        { "uncompressed_size",          KSTAT_DATA_UINT64 },
        { "overhead_size",              KSTAT_DATA_UINT64 },
        { "hdr_size",                   KSTAT_DATA_UINT64 },
        { "data_size",                  KSTAT_DATA_UINT64 },
        { "metadata_size",              KSTAT_DATA_UINT64 },
        { "dbuf_size",                  KSTAT_DATA_UINT64 },
        { "dnode_size",                 KSTAT_DATA_UINT64 },
        { "bonus_size",                 KSTAT_DATA_UINT64 },
        { "anon_size",                  KSTAT_DATA_UINT64 },
        { "anon_evictable_data",        KSTAT_DATA_UINT64 },
        { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
        { "mru_size",                   KSTAT_DATA_UINT64 },
        { "mru_evictable_data",         KSTAT_DATA_UINT64 },
        { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
        { "mru_ghost_size",             KSTAT_DATA_UINT64 },
        { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
        { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
        { "mfu_size",                   KSTAT_DATA_UINT64 },
        { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
        { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
        { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
        { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
        { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
        { "l2_hits",                    KSTAT_DATA_UINT64 },
        { "l2_misses",                  KSTAT_DATA_UINT64 },
        { "l2_feeds",                   KSTAT_DATA_UINT64 },
        { "l2_rw_clash",                KSTAT_DATA_UINT64 },
        { "l2_read_bytes",              KSTAT_DATA_UINT64 },
        { "l2_write_bytes",             KSTAT_DATA_UINT64 },
        { "l2_writes_sent",             KSTAT_DATA_UINT64 },
        { "l2_writes_done",             KSTAT_DATA_UINT64 },
        { "l2_writes_error",            KSTAT_DATA_UINT64 },
        { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
        { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
        { "l2_evict_reading",           KSTAT_DATA_UINT64 },
        { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
        { "l2_free_on_write",           KSTAT_DATA_UINT64 },
        { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
        { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
        { "l2_io_error",                KSTAT_DATA_UINT64 },
        /* Exported as "l2_size"; the member is arcstat_l2_lsize. */
        { "l2_size",                    KSTAT_DATA_UINT64 },
        /* Exported as "l2_asize"; the member is arcstat_l2_psize. */
        { "l2_asize",                   KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
        { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
        { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
        { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
        { "l2_write_in_l2",             KSTAT_DATA_UINT64 },
        /* The member is arcstat_l2_write_hdr_io_in_progress. */
        { "l2_write_io_in_progress",    KSTAT_DATA_UINT64 },
        { "l2_write_not_cacheable",     KSTAT_DATA_UINT64 },
        { "l2_write_full",              KSTAT_DATA_UINT64 },
        { "l2_write_buffer_iter",       KSTAT_DATA_UINT64 },
        { "l2_write_pios",              KSTAT_DATA_UINT64 },
        { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
        { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
        { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
        { "memory_throttle_count",      KSTAT_DATA_UINT64 },
        { "memory_direct_count",        KSTAT_DATA_UINT64 },
        { "memory_indirect_count",      KSTAT_DATA_UINT64 },
        { "memory_all_bytes",           KSTAT_DATA_UINT64 },
        { "memory_free_bytes",          KSTAT_DATA_UINT64 },
        { "memory_available_bytes",     KSTAT_DATA_UINT64 },
        /*
         * The next nine entries are exported with an "arc_" prefix; the
         * corresponding members are arcstat_no_grow ... arcstat_meta_min.
         */
        { "arc_no_grow",                KSTAT_DATA_UINT64 },
        { "arc_tempreserve",            KSTAT_DATA_UINT64 },
        { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
        { "arc_prune",                  KSTAT_DATA_UINT64 },
        { "arc_meta_used",              KSTAT_DATA_UINT64 },
        { "arc_meta_limit",             KSTAT_DATA_UINT64 },
        { "arc_dnode_limit",            KSTAT_DATA_UINT64 },
        { "arc_meta_max",               KSTAT_DATA_UINT64 },
        { "arc_meta_min",               KSTAT_DATA_UINT64 },
        { "async_upgrade_sync",         KSTAT_DATA_UINT64 },
        { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
        { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
 940
/*
 * Direct access to the ui64 value of one ARC statistic.  The expansion is a
 * plain member reference, so it can be read as well as assigned.
 */
#define ARCSTAT(stat)   (arc_stats.stat.value.ui64)

/*
 * Add val to a statistic with atomic_add_64().  ARCSTAT_BUMPDOWN passes -1,
 * so a negative val is used to decrement.
 */
#define ARCSTAT_INCR(stat, val) \
        atomic_add_64(&arc_stats.stat.value.ui64, (val))

/* Increment / decrement a statistic by one via ARCSTAT_INCR. */
#define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 948
/*
 * Raise a statistic to val if val is larger than the value currently
 * stored.  Each iteration re-reads the statistic into m and attempts an
 * atomic_cas_64() from m to val; the loop ends once val no longer exceeds
 * the stored value or the compare-and-swap returns m.
 *
 * Note that val is evaluated more than once, and that the macro expands to
 * a bare brace block (declaring a local named m) rather than to a single
 * statement.
 */
#define ARCSTAT_MAX(stat, val) {                                        \
        uint64_t m;                                                     \
        while ((val) > (m = arc_stats.stat.value.ui64) &&               \
            (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
                continue;                                               \
}

/*
 * Update the statistic named <stat>_max from the current value of <stat>,
 * using ARCSTAT_MAX.
 */
#define ARCSTAT_MAXSTAT(stat) \
        ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 958
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The statistic that gets bumped is named
 *     arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
 * where stat1 / stat2 are used when cond1 / cond2 are true and
 * notstat1 / notstat2 when they are false.
 *
 * The expansion is a bare if/else statement, so a use that is nested
 * directly under another unbraced if/else must be wrapped in braces.
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
        if (cond1) {                                                    \
                if (cond2) {                                            \
                        ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
                } else {                                                \
                        ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
                }                                                       \
        } else {                                                        \
                if (cond2) {                                            \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
                } else {                                                \
                        ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
                }                                                       \
        }
 978
/*
 * NOTE(review): presumably the kstat through which arc_stats is exported;
 * it is assigned outside this part of the file -- confirm.
 */
kstat_t                 *arc_ksp;
/*
 * Pointers to the ARC states.  GHOST_STATE() treats arc_mru_ghost,
 * arc_mfu_ghost and arc_l2c_only as ghost states.
 */
static arc_state_t      *arc_anon;
static arc_state_t      *arc_mru;
static arc_state_t      *arc_mru_ghost;
static arc_state_t      *arc_mfu;
static arc_state_t      *arc_mfu_ghost;
static arc_state_t      *arc_l2c_only;
 986
/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 *
 * Because ARCSTAT() expands to the statistic's value member, each of these
 * names can be read and assigned like an ordinary variable (for example
 * "arc_c_max = val").
 */
#define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
#define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
#define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
#define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
#define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
#define arc_dbuf_size   ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
#define arc_dnode_size  ARCSTAT(arcstat_dnode_size) /* dnode metadata */
#define arc_bonus_size  ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */

/* compressed size of entire arc */
#define arc_compressed_size     ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define arc_uncompressed_size   ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define arc_overhead_size       ARCSTAT(arcstat_overhead_size)
1013
/*
 * There are also some ARC variables that we want to export, but that are
 * updated so often that having the canonical representation be the statistic
 * variable causes a performance bottleneck. We want to use aggsum_t's for these
 * instead, but still be able to export the kstat in the same way as before.
 * The solution is to always use the aggsum version, except in the kstat update
 * callback.
 *
 * NOTE(review): astat_bonus_size, astat_dnode_size and astat_dbuf_size have
 * kstat-backed counterparts that are also reachable through the
 * arc_bonus_size / arc_dnode_size / arc_dbuf_size macros defined above;
 * confirm that only the aggsum versions are updated at run time.
 */
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_hdr_size;
aggsum_t astat_bonus_size;
aggsum_t astat_dnode_size;
aggsum_t astat_dbuf_size;
aggsum_t astat_l2_hdr_size;
1031
/*
 * State for ARC pruning.
 * NOTE(review): presumably arc_prune_mtx protects arc_prune_list and the
 * taskq runs the registered prune work; neither is visible here -- confirm.
 */
static list_t arc_prune_list;
static kmutex_t arc_prune_mtx;
static taskq_t *arc_prune_taskq;

/*
 * Plain variables that have kstats of matching names in arc_stats
 * ("arc_no_grow", "arc_tempreserve", "arc_loaned_bytes").
 * NOTE(review): presumably copied into the kstat by the update callback;
 * confirm.
 */
static int              arc_no_grow;    /* Don't try to grow cache size */
static uint64_t         arc_tempreserve;
static uint64_t         arc_loaned_bytes;
1039
typedef struct arc_callback arc_callback_t;

/*
 * One read callback record.  Records are chained through acb_next, and an
 * L1 header refers to them through its b_acb member.
 *
 * NOTE(review): the per-field notes below are inferred from the member
 * names and types only; confirm against the code that fills them in.
 */
struct arc_callback {
        void                    *acb_private;   /* opaque caller cookie */
        arc_read_done_func_t    *acb_done;      /* read-done function */
        arc_buf_t               *acb_buf;       /* buffer for this caller */
        boolean_t               acb_compressed; /* compressed buf wanted? */
        zio_t                   *acb_zio_dummy;
        zio_t                   *acb_zio_head;
        arc_callback_t          *acb_next;      /* next record in the chain */
};
1051
typedef struct arc_write_callback arc_write_callback_t;

/*
 * Bundle of write callbacks for one buffer: four arc_write_done_func_t
 * hooks, the buffer, and an opaque pointer.
 *
 * NOTE(review): presumably each hook corresponds to the zio stage its name
 * suggests (ready, children ready, physically done, done) and awcb_private
 * is passed through to them -- confirm against the write path.
 */
struct arc_write_callback {
        void                    *awcb_private;
        arc_write_done_func_t   *awcb_ready;
        arc_write_done_func_t   *awcb_children_ready;
        arc_write_done_func_t   *awcb_physdone;
        arc_write_done_func_t   *awcb_done;
        arc_buf_t               *awcb_buf;
};
1062
1063 /*
1064  * ARC buffers are separated into multiple structs as a memory saving measure:
1065  *   - Common fields struct, always defined, and embedded within it:
1066  *       - L2-only fields, always allocated but undefined when not in L2ARC
1067  *       - L1-only fields, only allocated when in L1ARC
1068  *
1069  *           Buffer in L1                     Buffer only in L2
1070  *    +------------------------+          +------------------------+
1071  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
1072  *    |                        |          |                        |
1073  *    |                        |          |                        |
1074  *    |                        |          |                        |
1075  *    +------------------------+          +------------------------+
1076  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
1077  *    | (undefined if L1-only) |          |                        |
1078  *    +------------------------+          +------------------------+
1079  *    | l1arc_buf_hdr_t        |
1080  *    |                        |
1081  *    |                        |
1082  *    |                        |
1083  *    |                        |
1084  *    +------------------------+
1085  *
1086  * Because it's possible for the L2ARC to become extremely large, we can wind
1087  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
1088  * is minimized by only allocating the fields necessary for an L1-cached buffer
1089  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
1090  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
1091  * words in pointers. arc_hdr_realloc() is used to switch a header between
1092  * these two allocation states.
1093  */
/*
 * L1-only portion of an ARC buffer header.  It is embedded as the final
 * member (b_l1hdr) of struct arc_buf_hdr.
 */
typedef struct l1arc_buf_hdr {
        kmutex_t                b_freeze_lock;
        zio_cksum_t             *b_freeze_cksum;
#ifdef ZFS_DEBUG
        /*
         * Used for debugging with kmem_flags - by allocating and freeing
         * b_thawed when the buffer is thawed, we get a record of the stack
         * trace that thawed it.
         */
        void                    *b_thawed;
#endif

        /*
         * NOTE(review): presumably the first arc_buf_t attached to this
         * header (buffers are linked through b_next, see ARC_BUF_LAST) and
         * the number of attached buffers -- confirm against the users.
         */
        arc_buf_t               *b_buf;
        uint32_t                b_bufcnt;
        /* for waiting on writes to complete */
        kcondvar_t              b_cv;
        uint8_t                 b_byteswap;

        /* protected by arc state mutex */
        arc_state_t             *b_state;
        multilist_node_t        b_arc_node;

        /* updated atomically */
        clock_t                 b_arc_access;
        uint32_t                b_mru_hits;
        uint32_t                b_mru_ghost_hits;
        uint32_t                b_mfu_hits;
        uint32_t                b_mfu_ghost_hits;
        uint32_t                b_l2_hits;

        /* self protecting */
        refcount_t              b_refcnt;

        /* chain of arc_callback_t records (linked through acb_next) */
        arc_callback_t          *b_acb;
        abd_t                   *b_pabd;
} l1arc_buf_hdr_t;
1130
/* Forward declaration; struct l2arc_dev is defined later in the file. */
typedef struct l2arc_dev l2arc_dev_t;

/*
 * L2ARC portion of an ARC buffer header.  It is embedded as b_l2hdr in
 * struct arc_buf_hdr, ahead of the L1-only fields.
 */
typedef struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
        uint32_t                b_hits;

        list_node_t             b_l2node;
} l2arc_buf_hdr_t;
1141
/*
 * ARC buffer header: the common fields followed by the embedded L2 and L1
 * sub-headers.
 *
 * b_l1hdr must remain the last member.  HDR_L2ONLY_SIZE is defined as
 * offsetof(arc_buf_hdr_t, b_l1hdr), i.e. the L2-only size is this structure
 * cut off just before b_l1hdr.
 */
struct arc_buf_hdr {
        /* protected by hash lock */
        dva_t                   b_dva;
        uint64_t                b_birth;

        arc_buf_contents_t      b_type;
        arc_buf_hdr_t           *b_hash_next;
        arc_flags_t             b_flags;

        /*
         * This field stores the size of the data buffer after
         * compression, and is set in the arc's zio completion handlers.
         * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
         *
         * While the block pointers can store up to 32MB in their psize
         * field, we can only store up to 32MB minus 512B. This is due
         * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
         * a field of zeros represents 512B in the bp). We can't use a
         * bias of 1 since we need to reserve a psize of zero, here, to
         * represent holes and embedded blocks.
         *
         * This isn't a problem in practice, since the maximum size of a
         * buffer is limited to 16MB, so we never need to store 32MB in
         * this field. Even in the upstream illumos code base, the
         * maximum size of a buffer is limited to 16MB.
         */
        uint16_t                b_psize;

        /*
         * This field stores the size of the data buffer before
         * compression, and cannot change once set. It is in units
         * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
         */
        uint16_t                b_lsize;        /* immutable */
        uint64_t                b_spa;          /* immutable */

        /* L2ARC fields. Undefined when not in L2ARC. */
        l2arc_buf_hdr_t         b_l2hdr;
        /* L1ARC fields. Undefined when in l2arc_only state */
        l1arc_buf_hdr_t         b_l1hdr;
};
1183
1184 #if defined(__FreeBSD__) && defined(_KERNEL)
1185 static int
1186 sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
1187 {
1188         uint64_t val;
1189         int err;
1190
1191         val = arc_meta_limit;
1192         err = sysctl_handle_64(oidp, &val, 0, req);
1193         if (err != 0 || req->newptr == NULL)
1194                 return (err);
1195
1196         if (val <= 0 || val > arc_c_max)
1197                 return (EINVAL);
1198
1199         arc_meta_limit = val;
1200         return (0);
1201 }
1202
1203 static int
1204 sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
1205 {
1206         uint32_t val;
1207         int err;
1208
1209         val = arc_no_grow_shift;
1210         err = sysctl_handle_32(oidp, &val, 0, req);
1211         if (err != 0 || req->newptr == NULL)
1212                 return (err);
1213
1214         if (val >= arc_shrink_shift)
1215                 return (EINVAL);
1216
1217         arc_no_grow_shift = val;
1218         return (0);
1219 }
1220
1221 static int
1222 sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
1223 {
1224         uint64_t val;
1225         int err;
1226
1227         val = zfs_arc_max;
1228         err = sysctl_handle_64(oidp, &val, 0, req);
1229         if (err != 0 || req->newptr == NULL)
1230                 return (err);
1231
1232         if (zfs_arc_max == 0) {
1233                 /* Loader tunable so blindly set */
1234                 zfs_arc_max = val;
1235                 return (0);
1236         }
1237
1238         if (val < arc_abs_min || val > kmem_size())
1239                 return (EINVAL);
1240         if (val < arc_c_min)
1241                 return (EINVAL);
1242         if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
1243                 return (EINVAL);
1244
1245         arc_c_max = val;
1246
1247         arc_c = arc_c_max;
1248         arc_p = (arc_c >> 1);
1249
1250         if (zfs_arc_meta_limit == 0) {
1251                 /* limit meta-data to 1/4 of the arc capacity */
1252                 arc_meta_limit = arc_c_max / 4;
1253         }
1254
1255         /* if kmem_flags are set, lets try to use less memory */
1256         if (kmem_debugging())
1257                 arc_c = arc_c / 2;
1258
1259         zfs_arc_max = arc_c;
1260
1261         return (0);
1262 }
1263
1264 static int
1265 sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
1266 {
1267         uint64_t val;
1268         int err;
1269
1270         val = zfs_arc_min;
1271         err = sysctl_handle_64(oidp, &val, 0, req);
1272         if (err != 0 || req->newptr == NULL)
1273                 return (err);
1274
1275         if (zfs_arc_min == 0) {
1276                 /* Loader tunable so blindly set */
1277                 zfs_arc_min = val;
1278                 return (0);
1279         }
1280
1281         if (val < arc_abs_min || val > arc_c_max)
1282                 return (EINVAL);
1283
1284         arc_c_min = val;
1285
1286         if (zfs_arc_meta_min == 0)
1287                 arc_meta_min = arc_c_min / 2;
1288
1289         if (arc_c < arc_c_min)
1290                 arc_c = arc_c_min;
1291
1292         zfs_arc_min = arc_c_min;
1293
1294         return (0);
1295 }
1296 #endif
1297
/*
 * True when state is one of the ghost states: arc_mru_ghost, arc_mfu_ghost
 * or arc_l2c_only.
 */
#define GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
1301
/*
 * Predicates on a header's b_flags.  Most expand to the masked flag bits,
 * i.e. to zero or to some non-zero value rather than to exactly 1;
 * HDR_L2_READING and HDR_ISTYPE_DATA are logical expressions and yield
 * 0 or 1.
 */
#define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_PRESCIENT_PREFETCH(hdr)     \
        ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr)    \
        ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
/* True only when an I/O is in progress and the header has an L2 header. */
#define HDR_L2_READING(hdr)     \
        (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&  \
        ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define HDR_SHARED_DATA(hdr)    ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

/* A header counts as data whenever the metadata flag is clear. */
#define HDR_ISTYPE_METADATA(hdr)        \
        ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))

/* Whether the header is flagged as having an L1 / L2 sub-header. */
#define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
1326
1327 /* For storing compression mode in b_flags */
1328 #define HDR_COMPRESS_OFFSET     (highbit64(ARC_FLAG_COMPRESS_0) - 1)
1329
1330 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET((hdr)->b_flags, \
1331         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
1332 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
1333         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
1334
1335 #define ARC_BUF_LAST(buf)       ((buf)->b_next == NULL)
1336 #define ARC_BUF_SHARED(buf)     ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
1337 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
1338
1339 /*
1340  * Other sizes
1341  */
1342
1343 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
1344 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
1345
1346 /*
1347  * Hash table routines
1348  */
1349
1350 #define HT_LOCK_PAD     CACHE_LINE_SIZE
1351
1352 struct ht_lock {
1353         kmutex_t        ht_lock;
1354 #ifdef _KERNEL
1355         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
1356 #endif
1357 };
1358
1359 #define BUF_LOCKS 256
1360 typedef struct buf_hash_table {
1361         uint64_t ht_mask;
1362         arc_buf_hdr_t **ht_table;
1363         struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
1364 } buf_hash_table_t;
1365
1366 static buf_hash_table_t buf_hash_table;
1367
/*
 * Hash table addressing helpers.
 *
 * BUF_HASH_INDEX() maps a (spa, dva, birth) identity to a bucket index.
 * BUF_HASH_LOCK() maps a bucket index to the mutex covering that bucket;
 * HDR_LOCK() does both steps for an existing header.
 *
 * Macro arguments are fully parenthesized so that any expression can be
 * passed: the previous "idx & (BUF_LOCKS-1)" silently picked the wrong lock
 * for arguments built with lower-precedence operators such as '|' or '?:'.
 */
#define	BUF_HASH_INDEX(spa, dva, birth)	\
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx)	\
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	(hdr)->b_birth)))

/* CRC64 lookup table; populated by buf_init(). */
uint64_t zfs_crc64_table[256];
1376
1377 /*
1378  * Level 2 ARC
1379  */
1380
1381 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
1382 #define L2ARC_HEADROOM          2                       /* num of writes */
1383 /*
1384  * If we discover during ARC scan any buffers to be compressed, we boost
1385  * our headroom for the next scanning cycle by this percentage multiple.
1386  */
1387 #define L2ARC_HEADROOM_BOOST    200
1388 #define L2ARC_FEED_SECS         1               /* caching interval secs */
1389 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
1390
1391 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
1392 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
1393
1394 /* L2ARC Performance Tunables */
1395 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
1396 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
1397 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
1398 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
1399 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
1400 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
1401 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
1402 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
1403 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
1404
1405 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
1406     &l2arc_write_max, 0, "max write size");
1407 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
1408     &l2arc_write_boost, 0, "extra write during warmup");
1409 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
1410     &l2arc_headroom, 0, "number of dev writes");
1411 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
1412     &l2arc_feed_secs, 0, "interval seconds");
1413 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
1414     &l2arc_feed_min_ms, 0, "min interval milliseconds");
1415
1416 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
1417     &l2arc_noprefetch, 0, "don't cache prefetch bufs");
1418 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
1419     &l2arc_feed_again, 0, "turbo warmup");
1420 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
1421     &l2arc_norw, 0, "no reads during writes");
1422
1423 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1424     &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
1425 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
1426     &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1427     "size of anonymous state");
1428 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
1429     &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1430     "size of anonymous state");
1431
1432 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1433     &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
1434 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
1435     &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1436     "size of metadata in mru state");
1437 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
1438     &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1439     "size of data in mru state");
1440
1441 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1442     &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
1443 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
1444     &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1445     "size of metadata in mru ghost state");
1446 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
1447     &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1448     "size of data in mru ghost state");
1449
1450 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1451     &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
1452 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
1453     &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1454     "size of metadata in mfu state");
1455 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
1456     &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1457     "size of data in mfu state");
1458
1459 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1460     &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
1461 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
1462     &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
1463     "size of metadata in mfu ghost state");
1464 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
1465     &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
1466     "size of data in mfu ghost state");
1467
1468 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1469     &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
1470
1471 SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
1472     &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
1473 SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
1474     &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
1475
1476 /*
1477  * L2ARC Internals
1478  */
1479 struct l2arc_dev {
1480         vdev_t                  *l2ad_vdev;     /* vdev */
1481         spa_t                   *l2ad_spa;      /* spa */
1482         uint64_t                l2ad_hand;      /* next write location */
1483         uint64_t                l2ad_start;     /* first addr on device */
1484         uint64_t                l2ad_end;       /* last addr on device */
1485         boolean_t               l2ad_first;     /* first sweep through */
1486         boolean_t               l2ad_writing;   /* currently writing */
1487         kmutex_t                l2ad_mtx;       /* lock for buffer list */
1488         list_t                  l2ad_buflist;   /* buffer list */
1489         list_node_t             l2ad_node;      /* device list node */
1490         refcount_t              l2ad_alloc;     /* allocated bytes */
1491 };
1492
1493 static list_t L2ARC_dev_list;                   /* device list */
1494 static list_t *l2arc_dev_list;                  /* device list pointer */
1495 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
1496 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
1497 static list_t L2ARC_free_on_write;              /* free after write buf list */
1498 static list_t *l2arc_free_on_write;             /* free after write list ptr */
1499 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
1500 static uint64_t l2arc_ndev;                     /* number of devices */
1501
1502 typedef struct l2arc_read_callback {
1503         arc_buf_hdr_t           *l2rcb_hdr;             /* read header */
1504         blkptr_t                l2rcb_bp;               /* original blkptr */
1505         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
1506         int                     l2rcb_flags;            /* original flags */
1507         abd_t                   *l2rcb_abd;             /* temporary buffer */
1508 } l2arc_read_callback_t;
1509
1510 typedef struct l2arc_write_callback {
1511         l2arc_dev_t     *l2wcb_dev;             /* device info */
1512         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
1513 } l2arc_write_callback_t;
1514
1515 typedef struct l2arc_data_free {
1516         /* protected by l2arc_free_on_write_mtx */
1517         abd_t           *l2df_abd;
1518         size_t          l2df_size;
1519         arc_buf_contents_t l2df_type;
1520         list_node_t     l2df_list_node;
1521 } l2arc_data_free_t;
1522
1523 static kmutex_t l2arc_feed_thr_lock;
1524 static kcondvar_t l2arc_feed_thr_cv;
1525 static uint8_t l2arc_thread_exit;
1526
1527 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
1528 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
1529 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
1530 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
1531 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
1532 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
1533 static void arc_hdr_free_pabd(arc_buf_hdr_t *);
1534 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
1535 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1536 static boolean_t arc_is_overflowing();
1537 static void arc_buf_watch(arc_buf_t *);
1538 static void arc_prune_async(int64_t);
1539
1540 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1541 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1542 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1543 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1544
1545 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1546 static void l2arc_read_done(zio_t *);
1547
1548 static void
1549 l2arc_trim(const arc_buf_hdr_t *hdr)
1550 {
1551         l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1552
1553         ASSERT(HDR_HAS_L2HDR(hdr));
1554         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
1555
1556         if (HDR_GET_PSIZE(hdr) != 0) {
1557                 trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
1558                     HDR_GET_PSIZE(hdr), 0);
1559         }
1560 }
1561
1562 /*
1563  * We use Cityhash for this. It's fast, and has good hash properties without
1564  * requiring any large static buffers.
1565  */
1566 static uint64_t
1567 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1568 {
1569         return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
1570 }
1571
/* True when both words of the header's DVA are zero. */
#define	HDR_EMPTY(hdr)						\
	((hdr)->b_dva.dva_word[0] == 0 &&			\
	(hdr)->b_dva.dva_word[1] == 0)

/*
 * True when hdr carries exactly the identity (spa, dva, birth).
 *
 * The whole expansion and every argument are parenthesized so the macro
 * behaves as a single expression: without the outer parentheses
 * "!HDR_EQUAL(...)" negated only the first comparison.
 */
#define	HDR_EQUAL(spa, dva, birth, hdr)				\
	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
1580
1581 static void
1582 buf_discard_identity(arc_buf_hdr_t *hdr)
1583 {
1584         hdr->b_dva.dva_word[0] = 0;
1585         hdr->b_dva.dva_word[1] = 0;
1586         hdr->b_birth = 0;
1587 }
1588
1589 static arc_buf_hdr_t *
1590 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1591 {
1592         const dva_t *dva = BP_IDENTITY(bp);
1593         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1594         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1595         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1596         arc_buf_hdr_t *hdr;
1597
1598         mutex_enter(hash_lock);
1599         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1600             hdr = hdr->b_hash_next) {
1601                 if (HDR_EQUAL(spa, dva, birth, hdr)) {
1602                         *lockp = hash_lock;
1603                         return (hdr);
1604                 }
1605         }
1606         mutex_exit(hash_lock);
1607         *lockp = NULL;
1608         return (NULL);
1609 }
1610
1611 /*
1612  * Insert an entry into the hash table.  If there is already an element
1613  * equal to elem in the hash table, then the already existing element
1614  * will be returned and the new element will not be inserted.
1615  * Otherwise returns NULL.
1616  * If lockp == NULL, the caller is assumed to already hold the hash lock.
1617  */
1618 static arc_buf_hdr_t *
1619 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1620 {
1621         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1622         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1623         arc_buf_hdr_t *fhdr;
1624         uint32_t i;
1625
1626         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1627         ASSERT(hdr->b_birth != 0);
1628         ASSERT(!HDR_IN_HASH_TABLE(hdr));
1629
1630         if (lockp != NULL) {
1631                 *lockp = hash_lock;
1632                 mutex_enter(hash_lock);
1633         } else {
1634                 ASSERT(MUTEX_HELD(hash_lock));
1635         }
1636
1637         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1638             fhdr = fhdr->b_hash_next, i++) {
1639                 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1640                         return (fhdr);
1641         }
1642
1643         hdr->b_hash_next = buf_hash_table.ht_table[idx];
1644         buf_hash_table.ht_table[idx] = hdr;
1645         arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1646
1647         /* collect some hash table performance data */
1648         if (i > 0) {
1649                 ARCSTAT_BUMP(arcstat_hash_collisions);
1650                 if (i == 1)
1651                         ARCSTAT_BUMP(arcstat_hash_chains);
1652
1653                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1654         }
1655
1656         ARCSTAT_BUMP(arcstat_hash_elements);
1657         ARCSTAT_MAXSTAT(arcstat_hash_elements);
1658
1659         return (NULL);
1660 }
1661
1662 static void
1663 buf_hash_remove(arc_buf_hdr_t *hdr)
1664 {
1665         arc_buf_hdr_t *fhdr, **hdrp;
1666         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1667
1668         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1669         ASSERT(HDR_IN_HASH_TABLE(hdr));
1670
1671         hdrp = &buf_hash_table.ht_table[idx];
1672         while ((fhdr = *hdrp) != hdr) {
1673                 ASSERT3P(fhdr, !=, NULL);
1674                 hdrp = &fhdr->b_hash_next;
1675         }
1676         *hdrp = hdr->b_hash_next;
1677         hdr->b_hash_next = NULL;
1678         arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1679
1680         /* collect some hash table performance data */
1681         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1682
1683         if (buf_hash_table.ht_table[idx] &&
1684             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1685                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1686 }
1687
1688 /*
1689  * Global data structures and functions for the buf kmem cache.
1690  */
1691 static kmem_cache_t *hdr_full_cache;
1692 static kmem_cache_t *hdr_l2only_cache;
1693 static kmem_cache_t *buf_cache;
1694
1695 static void
1696 buf_fini(void)
1697 {
1698         int i;
1699
1700         kmem_free(buf_hash_table.ht_table,
1701             (buf_hash_table.ht_mask + 1) * sizeof (void *));
1702         for (i = 0; i < BUF_LOCKS; i++)
1703                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1704         kmem_cache_destroy(hdr_full_cache);
1705         kmem_cache_destroy(hdr_l2only_cache);
1706         kmem_cache_destroy(buf_cache);
1707 }
1708
1709 /*
1710  * Constructor callback - called when the cache is empty
1711  * and a new buf is requested.
1712  */
1713 /* ARGSUSED */
1714 static int
1715 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1716 {
1717         arc_buf_hdr_t *hdr = vbuf;
1718
1719         bzero(hdr, HDR_FULL_SIZE);
1720         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1721         refcount_create(&hdr->b_l1hdr.b_refcnt);
1722         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1723         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1724         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1725
1726         return (0);
1727 }
1728
1729 /* ARGSUSED */
1730 static int
1731 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1732 {
1733         arc_buf_hdr_t *hdr = vbuf;
1734
1735         bzero(hdr, HDR_L2ONLY_SIZE);
1736         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1737
1738         return (0);
1739 }
1740
1741 /* ARGSUSED */
1742 static int
1743 buf_cons(void *vbuf, void *unused, int kmflag)
1744 {
1745         arc_buf_t *buf = vbuf;
1746
1747         bzero(buf, sizeof (arc_buf_t));
1748         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1749         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1750
1751         return (0);
1752 }
1753
1754 /*
1755  * Destructor callback - called when a cached buf is
1756  * no longer required.
1757  */
1758 /* ARGSUSED */
1759 static void
1760 hdr_full_dest(void *vbuf, void *unused)
1761 {
1762         arc_buf_hdr_t *hdr = vbuf;
1763
1764         ASSERT(HDR_EMPTY(hdr));
1765         cv_destroy(&hdr->b_l1hdr.b_cv);
1766         refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1767         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1768         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1769         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1770 }
1771
1772 /* ARGSUSED */
1773 static void
1774 hdr_l2only_dest(void *vbuf, void *unused)
1775 {
1776         arc_buf_hdr_t *hdr = vbuf;
1777
1778         ASSERT(HDR_EMPTY(hdr));
1779         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1780 }
1781
1782 /* ARGSUSED */
1783 static void
1784 buf_dest(void *vbuf, void *unused)
1785 {
1786         arc_buf_t *buf = vbuf;
1787
1788         mutex_destroy(&buf->b_evict_lock);
1789         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1790 }
1791
1792 /*
1793  * Reclaim callback -- invoked when memory is low.
1794  */
1795 /* ARGSUSED */
1796 static void
1797 hdr_recl(void *unused)
1798 {
1799         dprintf("hdr_recl called\n");
1800         /*
1801          * umem calls the reclaim func when we destroy the buf cache,
1802          * which is after we do arc_fini().
1803          */
1804         if (!arc_dead)
1805                 cv_signal(&arc_reclaim_thread_cv);
1806 }
1807
1808 static void
1809 buf_init(void)
1810 {
1811         uint64_t *ct;
1812         uint64_t hsize = 1ULL << 12;
1813         int i, j;
1814
1815         /*
1816          * The hash table is big enough to fill all of physical memory
1817          * with an average block size of zfs_arc_average_blocksize (default 8K).
1818          * By default, the table will take up
1819          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1820          */
1821         while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1822                 hsize <<= 1;
1823 retry:
1824         buf_hash_table.ht_mask = hsize - 1;
1825         buf_hash_table.ht_table =
1826             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1827         if (buf_hash_table.ht_table == NULL) {
1828                 ASSERT(hsize > (1ULL << 8));
1829                 hsize >>= 1;
1830                 goto retry;
1831         }
1832
1833         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1834             0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1835         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1836             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1837             NULL, NULL, 0);
1838         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1839             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1840
1841         for (i = 0; i < 256; i++)
1842                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1843                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1844
1845         for (i = 0; i < BUF_LOCKS; i++) {
1846                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1847                     NULL, MUTEX_DEFAULT, NULL);
1848         }
1849 }
1850
1851 /*
1852  * This is the size that the buf occupies in memory. If the buf is compressed,
1853  * it will correspond to the compressed size. You should use this method of
1854  * getting the buf size unless you explicitly need the logical size.
1855  */
1856 int32_t
1857 arc_buf_size(arc_buf_t *buf)
1858 {
1859         return (ARC_BUF_COMPRESSED(buf) ?
1860             HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1861 }
1862
1863 int32_t
1864 arc_buf_lsize(arc_buf_t *buf)
1865 {
1866         return (HDR_GET_LSIZE(buf->b_hdr));
1867 }
1868
1869 enum zio_compress
1870 arc_get_compression(arc_buf_t *buf)
1871 {
1872         return (ARC_BUF_COMPRESSED(buf) ?
1873             HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1874 }
1875
1876 #define ARC_MINTIME     (hz>>4) /* 62 ms */
1877
1878 static inline boolean_t
1879 arc_buf_is_shared(arc_buf_t *buf)
1880 {
1881         boolean_t shared = (buf->b_data != NULL &&
1882             buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1883             abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1884             buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1885         IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1886         IMPLY(shared, ARC_BUF_SHARED(buf));
1887         IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1888
1889         /*
1890          * It would be nice to assert arc_can_share() too, but the "hdr isn't
1891          * already being shared" requirement prevents us from doing that.
1892          */
1893
1894         return (shared);
1895 }
1896
1897 /*
1898  * Free the checksum associated with this header. If there is no checksum, this
1899  * is a no-op.
1900  */
1901 static inline void
1902 arc_cksum_free(arc_buf_hdr_t *hdr)
1903 {
1904         ASSERT(HDR_HAS_L1HDR(hdr));
1905         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1906         if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1907                 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1908                 hdr->b_l1hdr.b_freeze_cksum = NULL;
1909         }
1910         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1911 }
1912
1913 /*
1914  * Return true iff at least one of the bufs on hdr is not compressed.
1915  */
1916 static boolean_t
1917 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1918 {
1919         for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1920                 if (!ARC_BUF_COMPRESSED(b)) {
1921                         return (B_TRUE);
1922                 }
1923         }
1924         return (B_FALSE);
1925 }
1926
1927 /*
1928  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1929  * matches the checksum that is stored in the hdr. If there is no checksum,
1930  * or if the buf is compressed, this is a no-op.
1931  */
1932 static void
1933 arc_cksum_verify(arc_buf_t *buf)
1934 {
1935         arc_buf_hdr_t *hdr = buf->b_hdr;
1936         zio_cksum_t zc;
1937
1938         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1939                 return;
1940
1941         if (ARC_BUF_COMPRESSED(buf)) {
1942                 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
1943                     arc_hdr_has_uncompressed_buf(hdr));
1944                 return;
1945         }
1946
1947         ASSERT(HDR_HAS_L1HDR(hdr));
1948
1949         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1950         if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1951                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1952                 return;
1953         }
1954
1955         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1956         if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1957                 panic("buffer modified while frozen!");
1958         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1959 }
1960
1961 static boolean_t
1962 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1963 {
1964         enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1965         boolean_t valid_cksum;
1966
1967         ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1968         VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1969
1970         /*
1971          * We rely on the blkptr's checksum to determine if the block
1972          * is valid or not. When compressed arc is enabled, the l2arc
1973          * writes the block to the l2arc just as it appears in the pool.
1974          * This allows us to use the blkptr's checksum to validate the
1975          * data that we just read off of the l2arc without having to store
1976          * a separate checksum in the arc_buf_hdr_t. However, if compressed
1977          * arc is disabled, then the data written to the l2arc is always
1978          * uncompressed and won't match the block as it exists in the main
1979          * pool. When this is the case, we must first compress it if it is
1980          * compressed on the main pool before we can validate the checksum.
1981          */
1982         if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1983                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1984                 uint64_t lsize = HDR_GET_LSIZE(hdr);
1985                 uint64_t csize;
1986
1987                 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
1988                 csize = zio_compress_data(compress, zio->io_abd,
1989                     abd_to_buf(cdata), lsize);
1990
1991                 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1992                 if (csize < HDR_GET_PSIZE(hdr)) {
1993                         /*
1994                          * Compressed blocks are always a multiple of the
1995                          * smallest ashift in the pool. Ideally, we would
1996                          * like to round up the csize to the next
1997                          * spa_min_ashift but that value may have changed
1998                          * since the block was last written. Instead,
1999                          * we rely on the fact that the hdr's psize
2000                          * was set to the psize of the block when it was
2001                          * last written. We set the csize to that value
2002                          * and zero out any part that should not contain
2003                          * data.
2004                          */
2005                         abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
2006                         csize = HDR_GET_PSIZE(hdr);
2007                 }
2008                 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
2009         }
2010
2011         /*
2012          * Block pointers always store the checksum for the logical data.
2013          * If the block pointer has the gang bit set, then the checksum
2014          * it represents is for the reconstituted data and not for an
2015          * individual gang member. The zio pipeline, however, must be able to
2016          * determine the checksum of each of the gang constituents so it
2017          * treats the checksum comparison differently than what we need
2018          * for l2arc blocks. This prevents us from using the
2019          * zio_checksum_error() interface directly. Instead we must call the
2020          * zio_checksum_error_impl() so that we can ensure the checksum is
2021          * generated using the correct checksum algorithm and accounts for the
2022          * logical I/O size and not just a gang fragment.
2023          */
2024         valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
2025             BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
2026             zio->io_offset, NULL) == 0);
2027         zio_pop_transforms(zio);
2028         return (valid_cksum);
2029 }
2030
2031 /*
2032  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
2033  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
2034  * isn't modified later on. If buf is compressed or there is already a checksum
2035  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
2036  */
2037 static void
2038 arc_cksum_compute(arc_buf_t *buf)
2039 {
2040         arc_buf_hdr_t *hdr = buf->b_hdr;
2041
2042         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
2043                 return;
2044
2045         ASSERT(HDR_HAS_L1HDR(hdr));
2046
2047         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
2048         if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
2049                 ASSERT(arc_hdr_has_uncompressed_buf(hdr));
2050                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
2051                 return;
2052         } else if (ARC_BUF_COMPRESSED(buf)) {
2053                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
2054                 return;
2055         }
2056
2057         ASSERT(!ARC_BUF_COMPRESSED(buf));
2058         hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
2059             KM_SLEEP);
2060         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
2061             hdr->b_l1hdr.b_freeze_cksum);
2062         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
2063 #ifdef illumos
2064         arc_buf_watch(buf);
2065 #endif
2066 }
2067
2068 #ifdef illumos
2069 #ifndef _KERNEL
2070 typedef struct procctl {
2071         long cmd;
2072         prwatch_t prwatch;
2073 } procctl_t;
2074 #endif
2075
2076 /* ARGSUSED */
2077 static void
2078 arc_buf_unwatch(arc_buf_t *buf)
2079 {
2080 #ifndef _KERNEL
2081         if (arc_watch) {
2082                 int result;
2083                 procctl_t ctl;
2084                 ctl.cmd = PCWATCH;
2085                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
2086                 ctl.prwatch.pr_size = 0;
2087                 ctl.prwatch.pr_wflags = 0;
2088                 result = write(arc_procfd, &ctl, sizeof (ctl));
2089                 ASSERT3U(result, ==, sizeof (ctl));
2090         }
2091 #endif
2092 }
2093
2094 /* ARGSUSED */
2095 static void
2096 arc_buf_watch(arc_buf_t *buf)
2097 {
2098 #ifndef _KERNEL
2099         if (arc_watch) {
2100                 int result;
2101                 procctl_t ctl;
2102                 ctl.cmd = PCWATCH;
2103                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
2104                 ctl.prwatch.pr_size = arc_buf_size(buf);
2105                 ctl.prwatch.pr_wflags = WA_WRITE;
2106                 result = write(arc_procfd, &ctl, sizeof (ctl));
2107                 ASSERT3U(result, ==, sizeof (ctl));
2108         }
2109 #endif
2110 }
2111 #endif /* illumos */
2112
2113 static arc_buf_contents_t
2114 arc_buf_type(arc_buf_hdr_t *hdr)
2115 {
2116         arc_buf_contents_t type;
2117         if (HDR_ISTYPE_METADATA(hdr)) {
2118                 type = ARC_BUFC_METADATA;
2119         } else {
2120                 type = ARC_BUFC_DATA;
2121         }
2122         VERIFY3U(hdr->b_type, ==, type);
2123         return (type);
2124 }
2125
2126 boolean_t
2127 arc_is_metadata(arc_buf_t *buf)
2128 {
2129         return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
2130 }
2131
2132 static uint32_t
2133 arc_bufc_to_flags(arc_buf_contents_t type)
2134 {
2135         switch (type) {
2136         case ARC_BUFC_DATA:
2137                 /* metadata field is 0 if buffer contains normal data */
2138                 return (0);
2139         case ARC_BUFC_METADATA:
2140                 return (ARC_FLAG_BUFC_METADATA);
2141         default:
2142                 break;
2143         }
2144         panic("undefined ARC buffer type!");
2145         return ((uint32_t)-1);
2146 }
2147
2148 void
2149 arc_buf_thaw(arc_buf_t *buf)
2150 {
2151         arc_buf_hdr_t *hdr = buf->b_hdr;
2152
2153         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2154         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2155
2156         arc_cksum_verify(buf);
2157
2158         /*
2159          * Compressed buffers do not manipulate the b_freeze_cksum or
2160          * allocate b_thawed.
2161          */
2162         if (ARC_BUF_COMPRESSED(buf)) {
2163                 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
2164                     arc_hdr_has_uncompressed_buf(hdr));
2165                 return;
2166         }
2167
2168         ASSERT(HDR_HAS_L1HDR(hdr));
2169         arc_cksum_free(hdr);
2170
2171         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
2172 #ifdef ZFS_DEBUG
2173         if (zfs_flags & ZFS_DEBUG_MODIFY) {
2174                 if (hdr->b_l1hdr.b_thawed != NULL)
2175                         kmem_free(hdr->b_l1hdr.b_thawed, 1);
2176                 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
2177         }
2178 #endif
2179
2180         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
2181
2182 #ifdef illumos
2183         arc_buf_unwatch(buf);
2184 #endif
2185 }
2186
2187 void
2188 arc_buf_freeze(arc_buf_t *buf)
2189 {
2190         arc_buf_hdr_t *hdr = buf->b_hdr;
2191         kmutex_t *hash_lock;
2192
2193         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
2194                 return;
2195
2196         if (ARC_BUF_COMPRESSED(buf)) {
2197                 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
2198                     arc_hdr_has_uncompressed_buf(hdr));
2199                 return;
2200         }
2201
2202         hash_lock = HDR_LOCK(hdr);
2203         mutex_enter(hash_lock);
2204
2205         ASSERT(HDR_HAS_L1HDR(hdr));
2206         ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
2207             hdr->b_l1hdr.b_state == arc_anon);
2208         arc_cksum_compute(buf);
2209         mutex_exit(hash_lock);
2210 }
2211
2212 /*
2213  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
2214  * the following functions should be used to ensure that the flags are
2215  * updated in a thread-safe way. When manipulating the flags either
2216  * the hash_lock must be held or the hdr must be undiscoverable. This
2217  * ensures that we're not racing with any other threads when updating
2218  * the flags.
2219  */
2220 static inline void
2221 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2222 {
2223         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2224         hdr->b_flags |= flags;
2225 }
2226
2227 static inline void
2228 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2229 {
2230         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2231         hdr->b_flags &= ~flags;
2232 }
2233
2234 /*
2235  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
2236  * done in a special way since we have to clear and set bits
2237  * at the same time. Consumers that wish to set the compression bits
2238  * must use this function to ensure that the flags are updated in
2239  * thread-safe manner.
2240  */
2241 static void
2242 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
2243 {
2244         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2245
2246         /*
2247          * Holes and embedded blocks will always have a psize = 0 so
2248          * we ignore the compression of the blkptr and set the
2249          * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
2250          * Holes and embedded blocks remain anonymous so we don't
2251          * want to uncompress them. Mark them as uncompressed.
2252          */
2253         if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
2254                 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2255                 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
2256                 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
2257                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2258         } else {
2259                 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2260                 HDR_SET_COMPRESS(hdr, cmp);
2261                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
2262                 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
2263         }
2264 }
2265
2266 /*
2267  * Looks for another buf on the same hdr which has the data decompressed, copies
2268  * from it, and returns true. If no such buf exists, returns false.
2269  */
2270 static boolean_t
2271 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
2272 {
2273         arc_buf_hdr_t *hdr = buf->b_hdr;
2274         boolean_t copied = B_FALSE;
2275
2276         ASSERT(HDR_HAS_L1HDR(hdr));
2277         ASSERT3P(buf->b_data, !=, NULL);
2278         ASSERT(!ARC_BUF_COMPRESSED(buf));
2279
2280         for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
2281             from = from->b_next) {
2282                 /* can't use our own data buffer */
2283                 if (from == buf) {
2284                         continue;
2285                 }
2286
2287                 if (!ARC_BUF_COMPRESSED(from)) {
2288                         bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
2289                         copied = B_TRUE;
2290                         break;
2291                 }
2292         }
2293
2294         /*
2295          * There were no decompressed bufs, so there should not be a
2296          * checksum on the hdr either.
2297          */
2298         EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
2299
2300         return (copied);
2301 }
2302
/*
 * Given a buf that has a data buffer attached to it, this function will
 * efficiently fill the buf with data of the specified compression setting from
 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
 * are already sharing a data buf, no copy is performed.
 *
 * If the buf is marked as compressed but uncompressed data was requested, this
 * will allocate a new data buffer for the buf, remove that flag, and fill the
 * buf with uncompressed data. You can't request a compressed buf on a hdr with
 * uncompressed data, and (since we haven't added support for it yet) if you
 * want compressed data your buf must already be marked as compressed and have
 * the correct-sized data buffer.
 *
 * Returns 0 on success, or EIO (via SET_ERROR) when zio_decompress_data()
 * reports a failure. On that failure path the buf's compressed flag has
 * already been cleared and its b_data may already have been reallocated;
 * byteswapping and checksum computation are skipped.
 */
static int
arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
{
        arc_buf_hdr_t *hdr = buf->b_hdr;
        boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
        dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;

        ASSERT3P(buf->b_data, !=, NULL);
        /* Compressed data can only come from a compressed hdr and buf. */
        IMPLY(compressed, hdr_compressed);
        IMPLY(compressed, ARC_BUF_COMPRESSED(buf));

        if (hdr_compressed == compressed) {
                /*
                 * The hdr already holds the data in the requested form; a
                 * copy is only needed when the buf has its own b_data.
                 */
                if (!arc_buf_is_shared(buf)) {
                        abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
                            arc_buf_size(buf));
                }
        } else {
                /* Only remaining case: compressed hdr, uncompressed request */
                ASSERT(hdr_compressed);
                ASSERT(!compressed);
                ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));

                /*
                 * If the buf is sharing its data with the hdr, unlink it and
                 * allocate a new data buffer for the buf.
                 */
                if (arc_buf_is_shared(buf)) {
                        ASSERT(ARC_BUF_COMPRESSED(buf));

                        /* We need to give the buf its own b_data */
                        buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
                        buf->b_data =
                            arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
                        arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);

                        /* Previously overhead was 0; just add new overhead */
                        ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
                } else if (ARC_BUF_COMPRESSED(buf)) {
                        /* We need to reallocate the buf's b_data */
                        arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
                            buf);
                        buf->b_data =
                            arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);

                        /* We increased the size of b_data; update overhead */
                        ARCSTAT_INCR(arcstat_overhead_size,
                            HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
                }

                /*
                 * Regardless of the buf's previous compression settings, it
                 * should not be compressed at the end of this function.
                 */
                buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;

                /*
                 * Try copying the data from another buf which already has a
                 * decompressed version. If that's not possible, it's time to
                 * bite the bullet and decompress the data from the hdr.
                 */
                if (arc_buf_try_copy_decompressed_data(buf)) {
                        /* Skip byteswapping and checksumming (already done) */
                        ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
                        return (0);
                } else {
                        int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
                            hdr->b_l1hdr.b_pabd, buf->b_data,
                            HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));

                        /*
                         * Absent hardware errors or software bugs, this should
                         * be impossible, but log it anyway so we can debug it.
                         */
                        if (error != 0) {
                                zfs_dbgmsg(
                                    "hdr %p, compress %d, psize %d, lsize %d",
                                    hdr, HDR_GET_COMPRESS(hdr),
                                    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
                                return (SET_ERROR(EIO));
                        }
                }
        }

        /*
         * Byteswap the buf's data if necessary. DMU_BSWAP_NUMFUNCS is the
         * value that means "no byteswap function"; anything else indexes
         * dmu_ot_byteswap[].
         */
        if (bswap != DMU_BSWAP_NUMFUNCS) {
                ASSERT(!HDR_SHARED_DATA(hdr));
                ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
                dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
        }

        /* Compute the hdr's checksum if necessary */
        arc_cksum_compute(buf);

        return (0);
}
2410
2411 int
2412 arc_decompress(arc_buf_t *buf)
2413 {
2414         return (arc_buf_fill(buf, B_FALSE));
2415 }
2416
2417 /*
2418  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
2419  */
2420 static uint64_t
2421 arc_hdr_size(arc_buf_hdr_t *hdr)
2422 {
2423         uint64_t size;
2424
2425         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
2426             HDR_GET_PSIZE(hdr) > 0) {
2427                 size = HDR_GET_PSIZE(hdr);
2428         } else {
2429                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
2430                 size = HDR_GET_LSIZE(hdr);
2431         }
2432         return (size);
2433 }
2434
2435 /*
2436  * Increment the amount of evictable space in the arc_state_t's refcount.
2437  * We account for the space used by the hdr and the arc buf individually
2438  * so that we can add and remove them from the refcount individually.
2439  */
2440 static void
2441 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2442 {
2443         arc_buf_contents_t type = arc_buf_type(hdr);
2444
2445         ASSERT(HDR_HAS_L1HDR(hdr));
2446
2447         if (GHOST_STATE(state)) {
2448                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2449                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2450                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2451                 (void) refcount_add_many(&state->arcs_esize[type],
2452                     HDR_GET_LSIZE(hdr), hdr);
2453                 return;
2454         }
2455
2456         ASSERT(!GHOST_STATE(state));
2457         if (hdr->b_l1hdr.b_pabd != NULL) {
2458                 (void) refcount_add_many(&state->arcs_esize[type],
2459                     arc_hdr_size(hdr), hdr);
2460         }
2461         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2462             buf = buf->b_next) {
2463                 if (arc_buf_is_shared(buf))
2464                         continue;
2465                 (void) refcount_add_many(&state->arcs_esize[type],
2466                     arc_buf_size(buf), buf);
2467         }
2468 }
2469
2470 /*
2471  * Decrement the amount of evictable space in the arc_state_t's refcount.
2472  * We account for the space used by the hdr and the arc buf individually
2473  * so that we can add and remove them from the refcount individually.
2474  */
2475 static void
2476 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2477 {
2478         arc_buf_contents_t type = arc_buf_type(hdr);
2479
2480         ASSERT(HDR_HAS_L1HDR(hdr));
2481
2482         if (GHOST_STATE(state)) {
2483                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2484                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2485                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2486                 (void) refcount_remove_many(&state->arcs_esize[type],
2487                     HDR_GET_LSIZE(hdr), hdr);
2488                 return;
2489         }
2490
2491         ASSERT(!GHOST_STATE(state));
2492         if (hdr->b_l1hdr.b_pabd != NULL) {
2493                 (void) refcount_remove_many(&state->arcs_esize[type],
2494                     arc_hdr_size(hdr), hdr);
2495         }
2496         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2497             buf = buf->b_next) {
2498                 if (arc_buf_is_shared(buf))
2499                         continue;
2500                 (void) refcount_remove_many(&state->arcs_esize[type],
2501                     arc_buf_size(buf), buf);
2502         }
2503 }
2504
2505 /*
2506  * Add a reference to this hdr indicating that someone is actively
2507  * referencing that memory. When the refcount transitions from 0 to 1,
2508  * we remove it from the respective arc_state_t list to indicate that
2509  * it is not evictable.
2510  */
2511 static void
2512 add_reference(arc_buf_hdr_t *hdr, void *tag)
2513 {
2514         ASSERT(HDR_HAS_L1HDR(hdr));
2515         if (!MUTEX_HELD(HDR_LOCK(hdr))) {
2516                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2517                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2518                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2519         }
2520
2521         arc_state_t *state = hdr->b_l1hdr.b_state;
2522
2523         if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2524             (state != arc_anon)) {
2525                 /* We don't use the L2-only state list. */
2526                 if (state != arc_l2c_only) {
2527                         multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2528                             hdr);
2529                         arc_evictable_space_decrement(hdr, state);
2530                 }
2531                 /* remove the prefetch flag if we get a reference */
2532                 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2533         }
2534 }
2535
2536 /*
2537  * Remove a reference from this hdr. When the reference transitions from
2538  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2539  * list making it eligible for eviction.
2540  */
2541 static int
2542 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2543 {
2544         int cnt;
2545         arc_state_t *state = hdr->b_l1hdr.b_state;
2546
2547         ASSERT(HDR_HAS_L1HDR(hdr));
2548         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2549         ASSERT(!GHOST_STATE(state));
2550
2551         /*
2552          * arc_l2c_only counts as a ghost state so we don't need to explicitly
2553          * check to prevent usage of the arc_l2c_only list.
2554          */
2555         if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2556             (state != arc_anon)) {
2557                 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2558                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2559                 arc_evictable_space_increment(hdr, state);
2560         }
2561         return (cnt);
2562 }
2563
2564 /*
2565  * Returns detailed information about a specific arc buffer.  When the
2566  * state_index argument is set the function will calculate the arc header
2567  * list position for its arc state.  Since this requires a linear traversal
2568  * callers are strongly encourage not to do this.  However, it can be helpful
2569  * for targeted analysis so the functionality is provided.
2570  */
2571 void
2572 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
2573 {
2574         arc_buf_hdr_t *hdr = ab->b_hdr;
2575         l1arc_buf_hdr_t *l1hdr = NULL;
2576         l2arc_buf_hdr_t *l2hdr = NULL;
2577         arc_state_t *state = NULL;
2578
2579         memset(abi, 0, sizeof (arc_buf_info_t));
2580
2581         if (hdr == NULL)
2582                 return;
2583
2584         abi->abi_flags = hdr->b_flags;
2585
2586         if (HDR_HAS_L1HDR(hdr)) {
2587                 l1hdr = &hdr->b_l1hdr;
2588                 state = l1hdr->b_state;
2589         }
2590         if (HDR_HAS_L2HDR(hdr))
2591                 l2hdr = &hdr->b_l2hdr;
2592
2593         if (l1hdr) {
2594                 abi->abi_bufcnt = l1hdr->b_bufcnt;
2595                 abi->abi_access = l1hdr->b_arc_access;
2596                 abi->abi_mru_hits = l1hdr->b_mru_hits;
2597                 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
2598                 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
2599                 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
2600                 abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
2601         }
2602
2603         if (l2hdr) {
2604                 abi->abi_l2arc_dattr = l2hdr->b_daddr;
2605                 abi->abi_l2arc_hits = l2hdr->b_hits;
2606         }
2607
2608         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
2609         abi->abi_state_contents = arc_buf_type(hdr);
2610         abi->abi_size = arc_hdr_size(hdr);
2611 }
2612
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 *
 * The work happens in three steps:
 *  1. If the hdr has no holds (refcnt == 0) it is moved from the old
 *     state's evictable list to the new state's list, with the evictable
 *     space (arcs_esize) adjusted to match.
 *  2. If the new state is arc_anon and the hdr is in the hash table, it is
 *     removed from the hash table.
 *  3. The total size (arcs_size) of the new and old states is adjusted,
 *     and the hdr's b_state is updated when it has an L1 hdr.
 *
 * new_state must differ from the hdr's current state. A hdr without an L1
 * hdr is treated as being in arc_l2c_only with no holds and no bufs.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
        arc_state_t *old_state;
        int64_t refcnt;
        uint32_t bufcnt;
        /* whether arcs_size of the old / new state must be adjusted */
        boolean_t update_old, update_new;
        arc_buf_contents_t buftype = arc_buf_type(hdr);

        /*
         * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
         * in arc_read() when bringing a buffer out of the L2ARC.  However, the
         * L1 hdr doesn't always exist when we change state to arc_anon before
         * destroying a header, in which case reallocating to add the L1 hdr is
         * pointless.
         */
        if (HDR_HAS_L1HDR(hdr)) {
                old_state = hdr->b_l1hdr.b_state;
                refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
                bufcnt = hdr->b_l1hdr.b_bufcnt;
                /* Sizes only need adjusting if the hdr holds any data. */
                update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
        } else {
                old_state = arc_l2c_only;
                refcnt = 0;
                bufcnt = 0;
                update_old = B_FALSE;
        }
        update_new = update_old;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
        ASSERT(old_state != arc_anon || bufcnt <= 1);

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon && old_state != arc_l2c_only) {
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_remove(old_state->arcs_list[buftype], hdr);

                        /*
                         * A ghost hdr holds no data, so update_old was
                         * computed as false above; force it so the ghost
                         * state's size is still adjusted below.
                         */
                        if (GHOST_STATE(old_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                update_old = B_TRUE;
                        }
                        arc_evictable_space_decrement(hdr, old_state);
                }
                if (new_state != arc_anon && new_state != arc_l2c_only) {

                        /*
                         * An L1 header always exists here, since if we're
                         * moving to some L1-cached state (i.e. not l2c_only or
                         * anonymous), we realloc the header to add an L1hdr
                         * beforehand.
                         */
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_insert(new_state->arcs_list[buftype], hdr);

                        /*
                         * Likewise force the size adjustment when the hdr
                         * is entering a ghost state.
                         */
                        if (GHOST_STATE(new_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                update_new = B_TRUE;
                        }
                        arc_evictable_space_increment(hdr, new_state);
                }
        }

        ASSERT(!HDR_EMPTY(hdr));
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
                buf_hash_remove(hdr);

        /* adjust state sizes (ignore arc_l2c_only) */

        if (update_new && new_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(new_state)) {
                        ASSERT0(bufcnt);

                        /*
                         * When moving a header to a ghost state, we first
                         * remove all arc buffers. Thus, we'll have a
                         * bufcnt of zero, and no arc buffer to use for
                         * the reference. As a result, we use the arc
                         * header pointer for the reference.
                         */
                        (void) refcount_add_many(&new_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must add each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * add to the refcount if the arc_buf_t is
                                 * not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) refcount_add_many(&new_state->arcs_size,
                                    arc_buf_size(buf), buf);
                        }
                        ASSERT3U(bufcnt, ==, buffers);

                        /*
                         * The hdr's own block is charged with the hdr as
                         * the tag; it can only be missing when the hdr is
                         * coming off a ghost state.
                         */
                        if (hdr->b_l1hdr.b_pabd != NULL) {
                                (void) refcount_add_many(&new_state->arcs_size,
                                    arc_hdr_size(hdr), hdr);
                        } else {
                                ASSERT(GHOST_STATE(old_state));
                        }
                }
        }

        if (update_old && old_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(old_state)) {
                        ASSERT0(bufcnt);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);

                        /*
                         * When moving a header off of a ghost state,
                         * the header will not contain any arc buffers.
                         * We use the arc header pointer for the reference
                         * which is exactly what we did when we put the
                         * header on the ghost state.
                         */

                        (void) refcount_remove_many(&old_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must remove each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * remove from the refcount if the arc_buf_t
                                 * is not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) refcount_remove_many(
                                    &old_state->arcs_size, arc_buf_size(buf),
                                    buf);
                        }
                        ASSERT3U(bufcnt, ==, buffers);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
                        (void) refcount_remove_many(
                            &old_state->arcs_size, arc_hdr_size(hdr), hdr);
                }
        }

        /* Record the new state; an L2-only hdr has nowhere to store it. */
        if (HDR_HAS_L1HDR(hdr))
                hdr->b_l1hdr.b_state = new_state;

        /*
         * L2 headers should never be on the L2 state list since they don't
         * have L1 headers allocated.
         */
        ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
            multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
2806
2807 void
2808 arc_space_consume(uint64_t space, arc_space_type_t type)
2809 {
2810         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2811
2812         switch (type) {
2813         case ARC_SPACE_DATA:
2814                 aggsum_add(&astat_data_size, space);
2815                 break;
2816         case ARC_SPACE_META:
2817                 aggsum_add(&astat_metadata_size, space);
2818                 break;
2819         case ARC_SPACE_BONUS:
2820                 aggsum_add(&astat_bonus_size, space);
2821                 break;
2822         case ARC_SPACE_DNODE:
2823                 aggsum_add(&astat_dnode_size, space);
2824                 break;
2825         case ARC_SPACE_DBUF:
2826                 aggsum_add(&astat_dbuf_size, space);
2827                 break;
2828         case ARC_SPACE_HDRS:
2829                 aggsum_add(&astat_hdr_size, space);
2830                 break;
2831         case ARC_SPACE_L2HDRS:
2832                 aggsum_add(&astat_l2_hdr_size, space);
2833                 break;
2834         }
2835
2836         if (type != ARC_SPACE_DATA)
2837                 aggsum_add(&arc_meta_used, space);
2838
2839         aggsum_add(&arc_size, space);
2840 }
2841
2842 void
2843 arc_space_return(uint64_t space, arc_space_type_t type)
2844 {
2845         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2846
2847         switch (type) {
2848         case ARC_SPACE_DATA:
2849                 aggsum_add(&astat_data_size, -space);
2850                 break;
2851         case ARC_SPACE_META:
2852                 aggsum_add(&astat_metadata_size, -space);
2853                 break;
2854         case ARC_SPACE_BONUS:
2855                 aggsum_add(&astat_bonus_size, -space);
2856                 break;
2857         case ARC_SPACE_DNODE:
2858                 aggsum_add(&astat_dnode_size, -space);
2859                 break;
2860         case ARC_SPACE_DBUF:
2861                 aggsum_add(&astat_dbuf_size, -space);
2862                 break;
2863         case ARC_SPACE_HDRS:
2864                 aggsum_add(&astat_hdr_size, -space);
2865                 break;
2866         case ARC_SPACE_L2HDRS:
2867                 aggsum_add(&astat_l2_hdr_size, -space);
2868                 break;
2869         }
2870
2871         if (type != ARC_SPACE_DATA) {
2872                 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2873                 /*
2874                  * We use the upper bound here rather than the precise value
2875                  * because the arc_meta_max value doesn't need to be
2876                  * precise. It's only consumed by humans via arcstats.
2877                  */
2878                 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2879                         arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2880                 aggsum_add(&arc_meta_used, -space);
2881         }
2882
2883         ASSERT(aggsum_compare(&arc_size, space) >= 0);
2884         aggsum_add(&arc_size, -space);
2885 }
2886
2887 /*
2888  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2889  * with the hdr's b_pabd.
2890  */
2891 static boolean_t
2892 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2893 {
2894         /*
2895          * The criteria for sharing a hdr's data are:
2896          * 1. the hdr's compression matches the buf's compression
2897          * 2. the hdr doesn't need to be byteswapped
2898          * 3. the hdr isn't already being shared
2899          * 4. the buf is either compressed or it is the last buf in the hdr list
2900          *
2901          * Criterion #4 maintains the invariant that shared uncompressed
2902          * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2903          * might ask, "if a compressed buf is allocated first, won't that be the
2904          * last thing in the list?", but in that case it's impossible to create
2905          * a shared uncompressed buf anyway (because the hdr must be compressed
2906          * to have the compressed buf). You might also think that #3 is
2907          * sufficient to make this guarantee, however it's possible
2908          * (specifically in the rare L2ARC write race mentioned in
2909          * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2910          * is sharable, but wasn't at the time of its allocation. Rather than
2911          * allow a new shared uncompressed buf to be created and then shuffle
2912          * the list around to make it the last element, this simply disallows
2913          * sharing if the new buf isn't the first to be added.
2914          */
2915         ASSERT3P(buf->b_hdr, ==, hdr);
2916         boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
2917         boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2918         return (buf_compressed == hdr_compressed &&
2919             hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2920             !HDR_SHARED_DATA(hdr) &&
2921             (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2922 }
2923
/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in.
 *
 * The new buf is pushed onto the front of the hdr's b_buf list, b_bufcnt is
 * incremented, and a reference is added to the hdr on behalf of `tag`.
 *
 *   hdr        - must have an L1 header, a non-zero lsize, and be of type
 *                ARC_BUFC_DATA or ARC_BUFC_METADATA.
 *   tag        - holder passed to add_reference().
 *   compressed - request a compressed buf; only honored when the hdr itself
 *                is compressed.
 *   fill       - when set, the buf is populated through arc_buf_fill().
 *   ret        - out parameter; must be non-NULL and *ret must be NULL on
 *                entry. *ret is assigned the new buf before the fill step,
 *                so it is set even when a non-zero value is returned.
 *
 * Returns 0 when `fill` is not set; otherwise returns the result of
 * arc_buf_fill() (0 if the copy was made successfully, or an error code
 * otherwise).
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
    boolean_t fill, arc_buf_t **ret)
{
        arc_buf_t *buf;

        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
        VERIFY(hdr->b_type == ARC_BUFC_DATA ||
            hdr->b_type == ARC_BUFC_METADATA);
        ASSERT3P(ret, !=, NULL);
        ASSERT3P(*ret, ==, NULL);

        /*
         * Initialize the new buf. b_next already points at the current list
         * head, but the hdr's list is not updated until b_data is set up
         * below, so ARC_BUF_LAST(buf) is evaluated against the old list.
         */
        buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_next = hdr->b_l1hdr.b_buf;
        buf->b_flags = 0;

        add_reference(hdr, tag);

        /*
         * We're about to change the hdr's b_flags. We must either
         * hold the hash_lock or be undiscoverable.
         */
        ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

        /*
         * Only honor requests for compressed bufs if the hdr is actually
         * compressed.
         */
        if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
                buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;

        /*
         * If the hdr's data can be shared then we share the data buffer and
         * set the appropriate bit in the hdr's b_flags to indicate the hdr is
         * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
         * buffer to store the buf's data.
         *
         * There are two additional restrictions here because we're sharing
         * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
         * actively involved in an L2ARC write, because if this buf is used by
         * an arc_write() then the hdr's data buffer will be released when the
         * write completes, even though the L2ARC write might still be using it.
         * Second, the hdr's ABD must be linear so that the buf's user doesn't
         * need to be ABD-aware.
         */
        boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
            abd_is_linear(hdr->b_l1hdr.b_pabd);

        /* Set up b_data and sharing */
        if (can_share) {
                /* Point the buf directly at the hdr's existing data. */
                buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
                buf->b_flags |= ARC_BUF_FLAG_SHARED;
                arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
        } else {
                /* Private copy: its size is accounted as overhead. */
                buf->b_data =
                    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
                ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
        }
        VERIFY3P(buf->b_data, !=, NULL);

        /* Publish the buf as the new head of the hdr's buf list. */
        hdr->b_l1hdr.b_buf = buf;
        hdr->b_l1hdr.b_bufcnt += 1;

        /*
         * If the user wants the data from the hdr, we need to either copy or
         * decompress the data.
         */
        if (fill) {
                return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
        }

        return (0);
}
3005
/*
 * Reference tag used for buffers that are out on loan: arc_loan_buf() and
 * arc_loan_compressed_buf() allocate with it, arc_loan_inuse_buf() adds it,
 * and arc_return_buf() removes it from the hdr's refcount.
 */
static char *arc_onloan_tag = "onloan";
3007
/*
 * Atomically adjust arc_loaned_bytes by `delta` (negative when a loaned
 * buffer is being returned).
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
        atomic_add_64(&arc_loaned_bytes, delta);

        /*
         * assert that it did not wrap around; adding 0 and taking the new
         * value serves as a read of the counter, which is then checked as a
         * signed quantity.
         */
        ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
3016
3017 /*
3018  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
3019  * flight data by arc_tempreserve_space() until they are "returned". Loaned
3020  * buffers must be returned to the arc before they can be used by the DMU or
3021  * freed.
3022  */
3023 arc_buf_t *
3024 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
3025 {
3026         arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
3027             is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
3028
3029         arc_loaned_bytes_update(arc_buf_size(buf));
3030
3031         return (buf);
3032 }
3033
3034 arc_buf_t *
3035 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
3036     enum zio_compress compression_type)
3037 {
3038         arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
3039             psize, lsize, compression_type);
3040
3041         arc_loaned_bytes_update(arc_buf_size(buf));
3042
3043         return (buf);
3044 }
3045
3046
/*
 * Return a loaned arc buffer to the arc.
 *
 * The hdr reference held under arc_onloan_tag is replaced by one held under
 * `tag`, and the buf's size is subtracted from arc_loaned_bytes.
 */
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
        arc_buf_hdr_t *hdr = buf->b_hdr;

        ASSERT3P(buf->b_data, !=, NULL);
        ASSERT(HDR_HAS_L1HDR(hdr));
        /* Add the new holder before dropping the loan reference. */
        (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
        (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);

        /* The negated size is passed as the (signed) delta. */
        arc_loaned_bytes_update(-arc_buf_size(buf));
}
3062
/*
 * Detach an arc_buf from a dbuf (tag).
 *
 * Mirror image of arc_return_buf(): the hdr reference held under `tag` is
 * swapped for one held under arc_onloan_tag, and the buf's size is added to
 * arc_loaned_bytes.
 */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
        arc_buf_hdr_t *hdr = buf->b_hdr;

        ASSERT3P(buf->b_data, !=, NULL);
        ASSERT(HDR_HAS_L1HDR(hdr));
        /* Take the loan reference before dropping the caller's. */
        (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
        (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);

        arc_loaned_bytes_update(arc_buf_size(buf));
}
3076
3077 static void
3078 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
3079 {
3080         l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
3081
3082         df->l2df_abd = abd;
3083         df->l2df_size = size;
3084         df->l2df_type = type;
3085         mutex_enter(&l2arc_free_on_write_mtx);
3086         list_insert_head(l2arc_free_on_write, df);
3087         mutex_exit(&l2arc_free_on_write_mtx);
3088 }
3089
3090 static void
3091 arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
3092 {
3093         arc_state_t *state = hdr->b_l1hdr.b_state;
3094         arc_buf_contents_t type = arc_buf_type(hdr);
3095         uint64_t size = arc_hdr_size(hdr);
3096
3097         /* protected by hash lock, if in the hash table */
3098         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3099                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3100                 ASSERT(state != arc_anon && state != arc_l2c_only);
3101
3102                 (void) refcount_remove_many(&state->arcs_esize[type],
3103                     size, hdr);
3104         }
3105         (void) refcount_remove_many(&state->arcs_size, size, hdr);
3106         if (type == ARC_BUFC_METADATA) {
3107                 arc_space_return(size, ARC_SPACE_META);
3108         } else {
3109                 ASSERT(type == ARC_BUFC_DATA);
3110                 arc_space_return(size, ARC_SPACE_DATA);
3111         }
3112
3113         l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
3114 }
3115
/*
 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
 * data buffer, we transfer the refcount ownership to the hdr and update
 * the appropriate kstats.
 *
 * Preconditions (asserted): arc_can_share(hdr, buf) holds, the hdr has no
 * b_pabd of its own, and the caller either holds the hdr's hash lock or the
 * hdr is empty (HDR_EMPTY).
 *
 * On return hdr->b_l1hdr.b_pabd wraps buf->b_data, and both the hdr
 * (ARC_FLAG_SHARED_DATA) and the buf (ARC_BUF_FLAG_SHARED) are flagged as
 * sharing.
 */
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
        arc_state_t *state = hdr->b_l1hdr.b_state;

        ASSERT(arc_can_share(hdr, buf));
        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
        ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

        /*
         * Start sharing the data buffer. We transfer the
         * refcount ownership to the hdr since it always owns
         * the refcount whenever an arc_buf_t is shared.
         */
        refcount_transfer_ownership(&state->arcs_size, buf, hdr);
        /* Build an abd around the buf's existing memory; no data is copied. */
        hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
        abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
            HDR_ISTYPE_METADATA(hdr));
        arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
        buf->b_flags |= ARC_BUF_FLAG_SHARED;

        /*
         * Since we've transferred ownership to the hdr we need
         * to increment its compressed and uncompressed kstats and
         * decrement the overhead size.
         */
        ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
        ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
        ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
3151
/*
 * Stop sharing buf's data buffer with the hdr; the inverse of
 * arc_share_buf().
 *
 * Preconditions (asserted): buf is currently shared, the hdr has a b_pabd,
 * and the caller either holds the hdr's hash lock or the hdr is empty
 * (HDR_EMPTY).
 *
 * On return the arcs_size refcount holder has moved from the hdr back to the
 * buf, the shared flags are cleared on both, and hdr->b_l1hdr.b_pabd is
 * NULL. buf->b_data is not modified.
 */
static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
        arc_state_t *state = hdr->b_l1hdr.b_state;

        ASSERT(arc_buf_is_shared(buf));
        ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
        ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

        /*
         * We are no longer sharing this buffer so we need
         * to transfer its ownership to the rightful owner.
         */
        refcount_transfer_ownership(&state->arcs_size, hdr, buf);
        arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
        /*
         * Give up the abd's ownership of the buffer, then drop the abd.
         * NOTE(review): presumably abd_put() releases only the abd wrapper
         * and leaves the underlying buffer for the buf -- confirm against
         * the abd implementation.
         */
        abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
        abd_put(hdr->b_l1hdr.b_pabd);
        hdr->b_l1hdr.b_pabd = NULL;
        buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

        /*
         * Since the buffer is no longer shared between
         * the arc buf and the hdr, count it as overhead.
         */
        ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
        ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
        ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
3180
/*
 * Remove an arc_buf_t from the hdr's buf list and return the last
 * arc_buf_t on the list. If no buffers remain on the list then return
 * NULL.
 *
 * The hdr must have an L1 header, and the caller must either hold the hdr's
 * hash lock or the hdr must be empty (HDR_EMPTY). buf->b_next is reset to
 * NULL. b_bufcnt is not modified here.
 */
static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

        /*
         * bufp always points at the link (the list head or some buf's
         * b_next field) that refers to the element under inspection, so
         * unlinking is a single store through bufp.
         */
        arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
        arc_buf_t *lastbuf = NULL;

        /*
         * Remove the buf from the hdr list and locate the last
         * remaining buffer on the list.
         */
        while (*bufp != NULL) {
                /* Unlink: make the current link skip over buf. */
                if (*bufp == buf)
                        *bufp = buf->b_next;

                /*
                 * If we've removed a buffer in the middle of
                 * the list then update the lastbuf and update
                 * bufp.
                 */
                if (*bufp != NULL) {
                        lastbuf = *bufp;
                        bufp = &(*bufp)->b_next;
                }
        }
        buf->b_next = NULL;
        ASSERT3P(lastbuf, !=, buf);
        IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
        IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
        IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));

        return (lastbuf);
}
3221
/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 *
 * If the buf being destroyed was the (uncompressed) buf sharing its data
 * with the hdr and other bufs remain, the hdr's b_pabd is freed and sharing
 * is re-established with the new last buf on the list. The hdr's checksum is
 * freed once no uncompressed buf remains. The buf itself is returned to
 * buf_cache; the hdr reference held for the buf is not dropped here.
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
        arc_buf_hdr_t *hdr = buf->b_hdr;

        /*
         * Free up the data associated with the buf but only if we're not
         * sharing this with the hdr. If we are sharing it with the hdr, the
         * hdr is responsible for doing the free.
         */
        if (buf->b_data != NULL) {
                /*
                 * We're about to change the hdr's b_flags. We must either
                 * hold the hash_lock or be undiscoverable.
                 */
                ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

                arc_cksum_verify(buf);
#ifdef illumos
                arc_buf_unwatch(buf);
#endif

                if (arc_buf_is_shared(buf)) {
                        /*
                         * Only the hdr flag is cleared here; the buf keeps
                         * ARC_BUF_FLAG_SHARED, which the re-sharing check
                         * below relies on.
                         */
                        arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
                } else {
                        uint64_t size = arc_buf_size(buf);
                        arc_free_data_buf(hdr, buf->b_data, size, buf);
                        ARCSTAT_INCR(arcstat_overhead_size, -size);
                }
                buf->b_data = NULL;

                ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
                hdr->b_l1hdr.b_bufcnt -= 1;
        }

        /* Unlink buf; lastbuf is the tail of what remains (or NULL). */
        arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

        if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
                /*
                 * If the current arc_buf_t is sharing its data buffer with the
                 * hdr, then reassign the hdr's b_pabd to share it with the new
                 * buffer at the end of the list. The shared buffer is always
                 * the last one on the hdr's buffer list.
                 *
                 * There is an equivalent case for compressed bufs, but since
                 * they aren't guaranteed to be the last buf in the list and
                 * that is an exceedingly rare case, we just allow that space be
                 * wasted temporarily.
                 */
                if (lastbuf != NULL) {
                        /* Only one buf can be shared at once */
                        VERIFY(!arc_buf_is_shared(lastbuf));
                        /* hdr is uncompressed so can't have compressed buf */
                        VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

                        ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
                        arc_hdr_free_pabd(hdr);

                        /*
                         * We must setup a new shared block between the
                         * last buffer and the hdr. The data would have
                         * been allocated by the arc buf so we need to transfer
                         * ownership to the hdr since it's now being shared.
                         */
                        arc_share_buf(hdr, lastbuf);
                }
        } else if (HDR_SHARED_DATA(hdr)) {
                /*
                 * Uncompressed shared buffers are always at the end
                 * of the list. Compressed buffers don't have the
                 * same requirements. This makes it hard to
                 * simply assert that the lastbuf is shared so
                 * we rely on the hdr's compression flags to determine
                 * if we have a compressed, shared buffer.
                 */
                ASSERT3P(lastbuf, !=, NULL);
                ASSERT(arc_buf_is_shared(lastbuf) ||
                    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
        }

        /*
         * Free the checksum if we're removing the last uncompressed buf from
         * this hdr.
         */
        if (!arc_hdr_has_uncompressed_buf(hdr)) {
                arc_cksum_free(hdr);
        }

        /* clean up the buf */
        buf->b_hdr = NULL;
        kmem_cache_free(buf_cache, buf);
}
3318
/*
 * Allocate the hdr's own data buffer (b_pabd), sized by arc_hdr_size(hdr),
 * and reset its byteswap state to DMU_BSWAP_NUMFUNCS.
 *
 * Preconditions (asserted): non-zero lsize, the hdr has an L1 header, is not
 * sharing data with a buf, and has no b_pabd yet. The compressed and
 * uncompressed size kstats are increased to account for the new buffer.
 */
static void
arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
        ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT(!HDR_SHARED_DATA(hdr));

        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
        /* The hdr itself is passed as the tag that owns the allocation. */
        hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
        hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
        ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

        ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
        ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
3334
3335 static void
3336 arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
3337 {
3338         ASSERT(HDR_HAS_L1HDR(hdr));
3339         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3340
3341         /*
3342          * If the hdr is currently being written to the l2arc then
3343          * we defer freeing the data by adding it to the l2arc_free_on_write
3344          * list. The l2arc will free the data once it's finished
3345          * writing it to the l2arc device.
3346          */
3347         if (HDR_L2_WRITING(hdr)) {
3348                 arc_hdr_free_on_write(hdr);
3349                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
3350         } else {
3351                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3352                     arc_hdr_size(hdr), hdr);
3353         }
3354         hdr->b_l1hdr.b_pabd = NULL;
3355         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3356
3357         ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3358         ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3359 }
3360
/*
 * Allocate and initialize a full (L1-capable) arc_buf_hdr_t in the arc_anon
 * state, with no bufs attached, and allocate its b_pabd.
 *
 *   spa              - stored in hdr->b_spa (callers in this file pass
 *                      spa_load_guid())
 *   psize, lsize     - physical and logical sizes recorded in the hdr
 *   compression_type - compression recorded via arc_hdr_set_compress()
 *   type             - ARC_BUFC_DATA or ARC_BUFC_METADATA (VERIFY'd)
 *
 * Returns the new hdr; its reference count is zero on return.
 */
static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
    enum zio_compress compression_type, arc_buf_contents_t type)
{
        arc_buf_hdr_t *hdr;

        VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);

        hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
        /* A hdr fresh from the cache is expected to carry no identity. */
        ASSERT(HDR_EMPTY(hdr));
        ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
        ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
        HDR_SET_PSIZE(hdr, psize);
        HDR_SET_LSIZE(hdr, lsize);
        hdr->b_spa = spa;
        hdr->b_type = type;
        /* Start from a clean flag word, then set type and L1 presence. */
        hdr->b_flags = 0;
        arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
        arc_hdr_set_compress(hdr, compression_type);

        hdr->b_l1hdr.b_state = arc_anon;
        hdr->b_l1hdr.b_arc_access = 0;
        hdr->b_l1hdr.b_bufcnt = 0;
        hdr->b_l1hdr.b_buf = NULL;

        /*
         * Allocate the hdr's buffer. This will contain either
         * the compressed or uncompressed data depending on the block
         * it references and compressed arc enablement.
         */
        arc_hdr_alloc_pabd(hdr);
        ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));

        return (hdr);
}
3396
/*
 * Transition between the two allocation states for the arc_buf_hdr struct.
 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
 * version is used when a cache buffer is only in the L2ARC in order to reduce
 * memory usage.
 *
 *   hdr - header to convert; must have an L2 header and its hash lock must
 *         be held by the caller.
 *   old - the cache hdr was allocated from.
 *   new - the cache to allocate the replacement from; exactly one of
 *         old/new is hdr_full_cache and the other is hdr_l2only_cache.
 *
 * Returns the replacement header. The old header is removed from the hash
 * table and the L2ARC device's buflist, its identity is discarded, and it is
 * freed back to `old`, so the caller must not use `hdr` afterwards.
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
        ASSERT(HDR_HAS_L2HDR(hdr));

        arc_buf_hdr_t *nhdr;
        l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

        ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
            (old == hdr_l2only_cache && new == hdr_full_cache));

        nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

        ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
        buf_hash_remove(hdr);

        /*
         * Copy the first HDR_L2ONLY_SIZE bytes of the old header.
         * NOTE(review): presumably this is the portion common to both
         * layouts -- confirm against the arc_buf_hdr_t definition.
         */
        bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

        if (new == hdr_full_cache) {
                arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
                /*
                 * arc_access and arc_change_state need to be aware that a
                 * header has just come out of L2ARC, so we set its state to
                 * l2c_only even though it's about to change.
                 */
                nhdr->b_l1hdr.b_state = arc_l2c_only;

                /* Verify previous threads set to NULL before freeing */
                ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
        } else {
                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                ASSERT0(hdr->b_l1hdr.b_bufcnt);
                ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

                /*
                 * If we've reached here, we must have been called from
                 * arc_evict_hdr(), as such we should have already been
                 * removed from any ghost list we were previously on
                 * (which protects us from racing with arc_evict_state),
                 * thus no locking is needed during this check.
                 */
                ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

                /*
                 * A buffer must not be moved into the arc_l2c_only
                 * state if it's not finished being written out to the
                 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
                 * might try to be accessed, even though it was removed.
                 */
                VERIFY(!HDR_L2_WRITING(hdr));
                VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);

#ifdef ZFS_DEBUG
                /* Release the debug-only b_thawed marker of the old hdr. */
                if (hdr->b_l1hdr.b_thawed != NULL) {
                        kmem_free(hdr->b_l1hdr.b_thawed, 1);
                        hdr->b_l1hdr.b_thawed = NULL;
                }
#endif

                arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
        }
        /*
         * The header has been reallocated so we need to re-insert it into any
         * lists it was on.
         */
        (void) buf_hash_insert(nhdr, NULL);

        ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

        mutex_enter(&dev->l2ad_mtx);

        /*
         * We must place the realloc'ed header back into the list at
         * the same spot. Otherwise, if it's placed earlier in the list,
         * l2arc_write_buffers() could find it during the function's
         * write phase, and try to write it out to the l2arc.
         */
        list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
        list_remove(&dev->l2ad_buflist, hdr);

        mutex_exit(&dev->l2ad_mtx);

        /*
         * Since we're using the pointer address as the tag when
         * incrementing and decrementing the l2ad_alloc refcount, we
         * must remove the old pointer (that we're about to destroy) and
         * add the new pointer to the refcount. Otherwise we'd remove
         * the wrong pointer address when calling arc_hdr_destroy() later.
         */

        (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
        (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);

        buf_discard_identity(hdr);
        kmem_cache_free(old, hdr);

        return (nhdr);
}
3502
3503 /*
3504  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3505  * The buf is returned thawed since we expect the consumer to modify it.
3506  */
3507 arc_buf_t *
3508 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
3509 {
3510         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3511             ZIO_COMPRESS_OFF, type);
3512         ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3513
3514         arc_buf_t *buf = NULL;
3515         VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf));
3516         arc_buf_thaw(buf);
3517
3518         return (buf);
3519 }
3520
3521 /*
3522  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3523  * for bufs containing metadata.
3524  */
3525 arc_buf_t *
3526 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3527     enum zio_compress compression_type)
3528 {
3529         ASSERT3U(lsize, >, 0);
3530         ASSERT3U(lsize, >=, psize);
3531         ASSERT(compression_type > ZIO_COMPRESS_OFF);
3532         ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS);
3533
3534         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3535             compression_type, ARC_BUFC_DATA);
3536         ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3537
3538         arc_buf_t *buf = NULL;
3539         VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf));
3540         arc_buf_thaw(buf);
3541         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3542
3543         if (!arc_buf_is_shared(buf)) {
3544                 /*
3545                  * To ensure that the hdr has the correct data in it if we call
3546                  * arc_decompress() on this buf before it's been written to
3547                  * disk, it's easiest if we just set up sharing between the
3548                  * buf and the hdr.
3549                  */
3550                 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
3551                 arc_hdr_free_pabd(hdr);
3552                 arc_share_buf(hdr, buf);
3553         }
3554
3555         return (buf);
3556 }
3557
3558 static void
3559 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3560 {
3561         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3562         l2arc_dev_t *dev = l2hdr->b_dev;
3563         uint64_t psize = arc_hdr_size(hdr);
3564
3565         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3566         ASSERT(HDR_HAS_L2HDR(hdr));
3567
3568         list_remove(&dev->l2ad_buflist, hdr);
3569
3570         ARCSTAT_INCR(arcstat_l2_psize, -psize);
3571         ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
3572
3573         vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
3574
3575         (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
3576         arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3577 }
3578
/*
 * Destroy a hdr and everything hanging off it: its identity, its L2ARC
 * portion (if any), its checksum, all attached bufs, its b_pabd, and finally
 * the hdr structure itself, which goes back to the cache matching its
 * layout (hdr_full_cache or hdr_l2only_cache).
 *
 * Preconditions (asserted): no I/O in progress and not in the hash table;
 * if the hdr has an L1 portion it must be unreferenced and in the arc_anon
 * state.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(hdr->b_l1hdr.b_buf == NULL ||
                    hdr->b_l1hdr.b_bufcnt > 0);
                ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
                ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
        }
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        ASSERT(!HDR_IN_HASH_TABLE(hdr));

        if (!HDR_EMPTY(hdr))
                buf_discard_identity(hdr);

        if (HDR_HAS_L2HDR(hdr)) {
                l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
                /*
                 * The caller may already hold l2ad_mtx; only take (and later
                 * drop) it here if that is not the case.
                 */
                boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

                if (!buflist_held)
                        mutex_enter(&dev->l2ad_mtx);

                /*
                 * Even though we checked this conditional above, we
                 * need to check this again now that we have the
                 * l2ad_mtx. This is because we could be racing with
                 * another thread calling l2arc_evict() which might have
                 * destroyed this header's L2 portion as we were waiting
                 * to acquire the l2ad_mtx. If that happens, we don't
                 * want to re-destroy the header's L2 portion.
                 */
                if (HDR_HAS_L2HDR(hdr)) {
                        l2arc_trim(hdr);
                        arc_hdr_l2hdr_destroy(hdr);
                }

                if (!buflist_held)
                        mutex_exit(&dev->l2ad_mtx);
        }

        if (HDR_HAS_L1HDR(hdr)) {
                arc_cksum_free(hdr);

                /*
                 * Each call unlinks and frees the current head of the buf
                 * list, so the loop ends once the list is empty.
                 */
                while (hdr->b_l1hdr.b_buf != NULL)
                        arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);

#ifdef ZFS_DEBUG
                if (hdr->b_l1hdr.b_thawed != NULL) {
                        kmem_free(hdr->b_l1hdr.b_thawed, 1);
                        hdr->b_l1hdr.b_thawed = NULL;
                }
#endif

                if (hdr->b_l1hdr.b_pabd != NULL) {
                        arc_hdr_free_pabd(hdr);
                }
        }

        ASSERT3P(hdr->b_hash_next, ==, NULL);
        if (HDR_HAS_L1HDR(hdr)) {
                ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
                ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
                kmem_cache_free(hdr_full_cache, hdr);
        } else {
                kmem_cache_free(hdr_l2only_cache, hdr);
        }
}
3646
3647 void
3648 arc_buf_destroy(arc_buf_t *buf, void* tag)
3649 {
3650         arc_buf_hdr_t *hdr = buf->b_hdr;
3651         kmutex_t *hash_lock = HDR_LOCK(hdr);
3652
3653         if (hdr->b_l1hdr.b_state == arc_anon) {
3654                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
3655                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3656                 VERIFY0(remove_reference(hdr, NULL, tag));
3657                 arc_hdr_destroy(hdr);
3658                 return;
3659         }
3660
3661         mutex_enter(hash_lock);
3662         ASSERT3P(hdr, ==, buf->b_hdr);
3663         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3664         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3665         ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3666         ASSERT3P(buf->b_data, !=, NULL);
3667
3668         (void) remove_reference(hdr, hash_lock, tag);
3669         arc_buf_destroy_impl(buf);
3670         mutex_exit(hash_lock);
3671 }
3672
/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function. The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 *
 * The caller must hold 'hash_lock' and 'hdr' must have an L1 header;
 * both are asserted on entry.
 *
 * Returns the number of bytes counted as evicted. Zero is returned when
 * nothing was evicted, e.g. because the header is still being written
 * to the L2ARC, has I/O in progress, or is a prefetch/indirect header
 * that has not yet reached its minimum lifetime.
 *
 * On the ghost path the header passed in is either reallocated
 * (arc_hdr_realloc()) or destroyed (arc_hdr_destroy()) before this
 * function returns.
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
        arc_state_t *evicted_state, *state;
        int64_t bytes_evicted = 0;
        /*
         * NOTE(review): the tunables are named *_ms, yet the value is
         * multiplied by hz in the lifetime check below -- confirm the
         * intended units.
         */
        int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
            zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(HDR_HAS_L1HDR(hdr));

        state = hdr->b_l1hdr.b_state;
        if (GHOST_STATE(state)) {
                ASSERT(!HDR_IO_IN_PROGRESS(hdr));
                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

                /*
                 * l2arc_write_buffers() relies on a header's L1 portion
                 * (i.e. its b_pabd field) during its write phase.
                 * Thus, we cannot push a header onto the arc_l2c_only
                 * state (removing its L1 piece) until the header is
                 * done being written to the l2arc.
                 */
                if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
                        ARCSTAT_BUMP(arcstat_evict_l2_skip);
                        return (bytes_evicted);
                }

                ARCSTAT_BUMP(arcstat_deleted);
                /* A ghost header is accounted by its logical size. */
                bytes_evicted += HDR_GET_LSIZE(hdr);

                DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

                ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                if (HDR_HAS_L2HDR(hdr)) {
                        /*
                         * This buffer is cached on the 2nd Level ARC;
                         * don't destroy the header.
                         */
                        arc_change_state(arc_l2c_only, hdr, hash_lock);
                        /*
                         * dropping from L1+L2 cached to L2-only,
                         * realloc to remove the L1 header. The local
                         * 'hdr' is updated but not used again before
                         * returning.
                         */
                        hdr = arc_hdr_realloc(hdr, hdr_full_cache,
                            hdr_l2only_cache);
                } else {
                        /* Not on the L2ARC either: drop the header. */
                        arc_change_state(arc_anon, hdr, hash_lock);
                        arc_hdr_destroy(hdr);
                }
                return (bytes_evicted);
        }

        ASSERT(state == arc_mru || state == arc_mfu);
        evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

        /*
         * Skip headers with I/O in progress; prefetch and indirect
         * buffers additionally have a minimum lifespan.
         */
        if (HDR_IO_IN_PROGRESS(hdr) ||
            ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
            ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
                ARCSTAT_BUMP(arcstat_evict_skip);
                return (bytes_evicted);
        }

        ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
        /*
         * Destroy the header's buffers one at a time. If a buffer's
         * evict lock cannot be taken without blocking we stop; the
         * header then keeps a non-zero b_bufcnt and is not moved to the
         * ghost state below.
         */
        while (hdr->b_l1hdr.b_buf) {
                arc_buf_t *buf = hdr->b_l1hdr.b_buf;
                if (!mutex_tryenter(&buf->b_evict_lock)) {
                        ARCSTAT_BUMP(arcstat_mutex_miss);
                        break;
                }
                if (buf->b_data != NULL)
                        bytes_evicted += HDR_GET_LSIZE(hdr);
                mutex_exit(&buf->b_evict_lock);
                arc_buf_destroy_impl(buf);
        }

        /* L2ARC statistics: already cached, eligible, or ineligible. */
        if (HDR_HAS_L2HDR(hdr)) {
                ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
        } else {
                if (l2arc_write_eligible(hdr->b_spa, hdr)) {
                        ARCSTAT_INCR(arcstat_evict_l2_eligible,
                            HDR_GET_LSIZE(hdr));
                } else {
                        ARCSTAT_INCR(arcstat_evict_l2_ineligible,
                            HDR_GET_LSIZE(hdr));
                }
        }

        /* Only a header with no buffers left moves to the ghost state. */
        if (hdr->b_l1hdr.b_bufcnt == 0) {
                arc_cksum_free(hdr);

                bytes_evicted += arc_hdr_size(hdr);

                /*
                 * If this hdr is being evicted and has a compressed
                 * buffer then we discard it here before we change states.
                 * This ensures that the accounting is updated correctly
                 * in arc_free_data_impl().
                 */
                arc_hdr_free_pabd(hdr);

                arc_change_state(evicted_state, hdr, hash_lock);
                ASSERT(HDR_IN_HASH_TABLE(hdr));
                arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
                DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
        }

        return (bytes_evicted);
}
3795
/*
 * Evict headers from a single sublist of 'ml'.
 *
 *   ml      multilist to evict from
 *   idx     index of the sublist to lock and scan
 *   marker  marker header already linked on that sublist; it records the
 *           scan position and is moved forward as the scan proceeds
 *   spa     if non-zero, only headers whose b_spa matches are evicted
 *   bytes   number of bytes to evict, or ARC_EVICT_ALL
 *
 * The scan repeatedly takes the header preceding the marker and stops
 * when the list is exhausted, when 'bytes' has been satisfied, or when
 * zfs_arc_evict_batch_limit headers have been evicted. Headers whose
 * hash lock cannot be taken without blocking are skipped.
 *
 * Returns the number of bytes evicted as reported by arc_evict_hdr().
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, int64_t bytes)
{
        multilist_sublist_t *mls;
        uint64_t bytes_evicted = 0;
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        int evict_count = 0;

        ASSERT3P(marker, !=, NULL);
        IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

        mls = multilist_sublist_lock(ml, idx);

        for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
            hdr = multilist_sublist_prev(mls, marker)) {
                /* Stop once the byte target or the batch limit is hit. */
                if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
                    (evict_count >= zfs_arc_evict_batch_limit))
                        break;

                /*
                 * To keep our iteration location, move the marker
                 * forward. Since we're not holding hdr's hash lock, we
                 * must be very careful and not remove 'hdr' from the
                 * sublist. Otherwise, other consumers might mistake the
                 * 'hdr' as not being on a sublist when they call the
                 * multilist_link_active() function (they all rely on
                 * the hash lock protecting concurrent insertions and
                 * removals). multilist_sublist_move_forward() was
                 * specifically implemented to ensure this is the case
                 * (only 'marker' will be removed and re-inserted).
                 */
                multilist_sublist_move_forward(mls, marker);

                /*
                 * The only case where the b_spa field should ever be
                 * zero, is the marker headers inserted by
                 * arc_evict_state(). It's possible for multiple threads
                 * to be calling arc_evict_state() concurrently (e.g.
                 * dsl_pool_close() and zio_inject_fault()), so we must
                 * skip any markers we see from these other threads.
                 */
                if (hdr->b_spa == 0)
                        continue;

                /* we're only interested in evicting buffers of a certain spa */
                if (spa != 0 && hdr->b_spa != spa) {
                        ARCSTAT_BUMP(arcstat_evict_skip);
                        continue;
                }

                hash_lock = HDR_LOCK(hdr);

                /*
                 * We aren't calling this function from any code path
                 * that would already be holding a hash lock, so we're
                 * asserting on this assumption to be defensive in case
                 * this ever changes. Without this check, it would be
                 * possible to incorrectly increment arcstat_mutex_miss
                 * below (e.g. if the code changed such that we called
                 * this function with a hash lock held).
                 */
                ASSERT(!MUTEX_HELD(hash_lock));

                /* Never block on the hash lock; a miss just skips hdr. */
                if (mutex_tryenter(hash_lock)) {
                        uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
                        mutex_exit(hash_lock);

                        bytes_evicted += evicted;

                        /*
                         * If evicted is zero, arc_evict_hdr() must have
                         * decided to skip this header, don't increment
                         * evict_count in this case.
                         */
                        if (evicted != 0)
                                evict_count++;

                        /*
                         * If arc_size isn't overflowing, signal any
                         * threads that might happen to be waiting.
                         *
                         * For each header evicted, we wake up a single
                         * thread. If we used cv_broadcast, we could
                         * wake up "too many" threads causing arc_size
                         * to significantly overflow arc_c; since
                         * arc_get_data_impl() doesn't check for overflow
                         * when it's woken up (it doesn't because it's
                         * possible for the ARC to be overflowing while
                         * full of un-evictable buffers, and the
                         * function should proceed in this case).
                         *
                         * If threads are left sleeping, due to not
                         * using cv_broadcast, they will be woken up
                         * just before arc_reclaim_thread() sleeps.
                         */
                        mutex_enter(&arc_reclaim_lock);
                        if (!arc_is_overflowing())
                                cv_signal(&arc_reclaim_waiters_cv);
                        mutex_exit(&arc_reclaim_lock);
                } else {
                        ARCSTAT_BUMP(arcstat_mutex_miss);
                }
        }

        multilist_sublist_unlock(mls);

        return (bytes_evicted);
}
3906
3907 /*
3908  * Evict buffers from the given arc state, until we've removed the
3909  * specified number of bytes. Move the removed buffers to the
3910  * appropriate evict state.
3911  *
3912  * This function makes a "best effort". It skips over any buffers
3913  * it can't get a hash_lock on, and so, may not catch all candidates.
3914  * It may also return without evicting as much space as requested.
3915  *
3916  * If bytes is specified using the special value ARC_EVICT_ALL, this
3917  * will evict all available (i.e. unlocked and evictable) buffers from
3918  * the given arc state; which is used by arc_flush().
3919  */
3920 static uint64_t
3921 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
3922     arc_buf_contents_t type)
3923 {
3924         uint64_t total_evicted = 0;
3925         multilist_t *ml = state->arcs_list[type];
3926         int num_sublists;
3927         arc_buf_hdr_t **markers;
3928
3929         IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3930
3931         num_sublists = multilist_get_num_sublists(ml);
3932
3933         /*
3934          * If we've tried to evict from each sublist, made some
3935          * progress, but still have not hit the target number of bytes
3936          * to evict, we want to keep trying. The markers allow us to
3937          * pick up where we left off for each individual sublist, rather
3938          * than starting from the tail each time.
3939          */
3940         markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
3941         for (int i = 0; i < num_sublists; i++) {
3942                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
3943
3944                 /*
3945                  * A b_spa of 0 is used to indicate that this header is
3946                  * a marker. This fact is used in arc_adjust_type() and
3947                  * arc_evict_state_impl().
3948                  */
3949                 markers[i]->b_spa = 0;
3950
3951                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3952                 multilist_sublist_insert_tail(mls, markers[i]);
3953                 multilist_sublist_unlock(mls);
3954         }
3955
3956         /*
3957          * While we haven't hit our target number of bytes to evict, or
3958          * we're evicting all available buffers.
3959          */
3960         while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
3961                 int sublist_idx = multilist_get_random_index(ml);
3962                 uint64_t scan_evicted = 0;
3963
3964                 /*
3965                  * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
3966                  * Request that 10% of the LRUs be scanned by the superblock
3967                  * shrinker.
3968                  */
3969                 if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
3970                     arc_dnode_limit) > 0) {
3971                         arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
3972                             arc_dnode_limit) / sizeof (dnode_t) /
3973                             zfs_arc_dnode_reduce_percent);
3974                 }
3975
3976                 /*
3977                  * Start eviction using a randomly selected sublist,
3978                  * this is to try and evenly balance eviction across all
3979                  * sublists. Always starting at the same sublist
3980                  * (e.g. index 0) would cause evictions to favor certain
3981                  * sublists over others.
3982                  */
3983                 for (int i = 0; i < num_sublists; i++) {
3984                         uint64_t bytes_remaining;
3985                         uint64_t bytes_evicted;
3986
3987                         if (bytes == ARC_EVICT_ALL)
3988                                 bytes_remaining = ARC_EVICT_ALL;
3989                         else if (total_evicted < bytes)
3990                                 bytes_remaining = bytes - total_evicted;
3991                         else
3992                                 break;
3993
3994                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
3995                             markers[sublist_idx], spa, bytes_remaining);
3996
3997                         scan_evicted += bytes_evicted;
3998                         total_evicted += bytes_evicted;
3999
4000                         /* we've reached the end, wrap to the beginning */
4001                         if (++sublist_idx >= num_sublists)
4002                                 sublist_idx = 0;
4003                 }
4004
4005                 /*
4006                  * If we didn't evict anything during this scan, we have
4007                  * no reason to believe we'll evict more during another
4008                  * scan, so break the loop.
4009                  */
4010                 if (scan_evicted == 0) {
4011                         /* This isn't possible, let's make that obvious */
4012                         ASSERT3S(bytes, !=, 0);
4013
4014                         /*
4015                          * When bytes is ARC_EVICT_ALL, the only way to
4016                          * break the loop is when scan_evicted is zero.
4017                          * In that case, we actually have evicted enough,
4018                          * so we don't want to increment the kstat.
4019                          */
4020                         if (bytes != ARC_EVICT_ALL) {
4021                                 ASSERT3S(total_evicted, <, bytes);
4022                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
4023                         }
4024
4025                         break;
4026                 }
4027         }
4028
4029         for (int i = 0; i < num_sublists; i++) {
4030                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4031                 multilist_sublist_remove(mls, markers[i]);
4032                 multilist_sublist_unlock(mls);
4033
4034                 kmem_cache_free(hdr_full_cache, markers[i]);
4035         }
4036         kmem_free(markers, sizeof (*markers) * num_sublists);
4037
4038         return (total_evicted);
4039 }
4040
4041 /*
4042  * Flush all "evictable" data of the given type from the arc state
4043  * specified. This will not evict any "active" buffers (i.e. referenced).
4044  *
4045  * When 'retry' is set to B_FALSE, the function will make a single pass
4046  * over the state and evict any buffers that it can. Since it doesn't
4047  * continually retry the eviction, it might end up leaving some buffers
4048  * in the ARC due to lock misses.
4049  *
4050  * When 'retry' is set to B_TRUE, the function will continually retry the
4051  * eviction until *all* evictable buffers have been removed from the
4052  * state. As a result, if concurrent insertions into the state are
4053  * allowed (e.g. if the ARC isn't shutting down), this function might
4054  * wind up in an infinite loop, continually trying to evict buffers.
4055  */
4056 static uint64_t
4057 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4058     boolean_t retry)
4059 {
4060         uint64_t evicted = 0;
4061
4062         while (refcount_count(&state->arcs_esize[type]) != 0) {
4063                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
4064
4065                 if (!retry)
4066                         break;
4067         }
4068
4069         return (evicted);
4070 }
4071
4072 /*
4073  * Helper function for arc_prune_async() it is responsible for safely
4074  * handling the execution of a registered arc_prune_func_t.
4075  */
4076 static void
4077 arc_prune_task(void *ptr)
4078 {
4079         arc_prune_t *ap = (arc_prune_t *)ptr;
4080         arc_prune_func_t *func = ap->p_pfunc;
4081
4082         if (func != NULL)
4083                 func(ap->p_adjust, ap->p_private);
4084
4085         refcount_remove(&ap->p_refcnt, func);
4086 }
4087
4088 /*
4089  * Notify registered consumers they must drop holds on a portion of the ARC
4090  * buffered they reference.  This provides a mechanism to ensure the ARC can
4091  * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
4092  * is analogous to dnlc_reduce_cache() but more generic.
4093  *
4094  * This operation is performed asynchronously so it may be safely called
4095  * in the context of the arc_reclaim_thread().  A reference is taken here
4096  * for each registered arc_prune_t and the arc_prune_task() is responsible
4097  * for releasing it once the registered arc_prune_func_t has completed.
4098  */
4099 static void
4100 arc_prune_async(int64_t adjust)
4101 {
4102         arc_prune_t *ap;
4103
4104         mutex_enter(&arc_prune_mtx);
4105         for (ap = list_head(&arc_prune_list); ap != NULL;
4106             ap = list_next(&arc_prune_list, ap)) {
4107
4108                 if (refcount_count(&ap->p_refcnt) >= 2)
4109                         continue;
4110
4111                 refcount_add(&ap->p_refcnt, ap->p_pfunc);
4112                 ap->p_adjust = adjust;
4113                 if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
4114                     ap, TQ_SLEEP) == TASKQID_INVALID) {
4115                         refcount_remove(&ap->p_refcnt, ap->p_pfunc);
4116                         continue;
4117                 }
4118                 ARCSTAT_BUMP(arcstat_prune);
4119         }
4120         mutex_exit(&arc_prune_mtx);
4121 }
4122
4123 /*
4124  * Evict the specified number of bytes from the state specified,
4125  * restricting eviction to the spa and type given. This function
4126  * prevents us from trying to evict more from a state's list than
4127  * is "evictable", and to skip evicting altogether when passed a
4128  * negative value for "bytes". In contrast, arc_evict_state() will
4129  * evict everything it can, when passed a negative value for "bytes".
4130  */
4131 static uint64_t
4132 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
4133     arc_buf_contents_t type)
4134 {
4135         int64_t delta;
4136
4137         if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
4138                 delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
4139                 return (arc_evict_state(state, spa, delta, type));
4140         }
4141
4142         return (0);
4143 }
4144
/*
 * The goal of this function is to evict enough meta data buffers from the
 * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
 * more complicated than it appears because it is common for data buffers
 * to have holds on meta data buffers.  In addition, dnode meta data buffers
 * will be held by the dnodes in the block preventing them from being freed.
 * This means we can't simply traverse the ARC and expect to always find
 * enough unheld meta data buffer to release.
 *
 * Therefore, this function has been updated to make alternating passes
 * over the ARC releasing data buffers and then newly unheld meta data
 * buffers.  This ensures forward progress is maintained and meta_used
 * will decrease.  Normally this is sufficient, but if required the ARC
 * will call the registered prune callbacks causing dentry and inodes to
 * be dropped from the VFS cache.  This will make dnode meta data buffers
 * available for reclaim.
 *
 * 'meta_used' is the caller's measurement of metadata usage.  Returns
 * the total number of bytes evicted over all passes.
 *
 * NOTE(review): 'meta_used' is never re-read inside this function, so
 * every comparison against arc_meta_limit uses the caller's snapshot.
 * Once that snapshot exceeds the limit, the restart path is taken until
 * 'restarts' is exhausted -- confirm this is intended.
 */
static uint64_t
arc_adjust_meta_balanced(uint64_t meta_used)
{
        int64_t delta, prune = 0, adjustmnt;
        uint64_t total_evicted = 0;
        /* The first pass targets data buffers; passes then alternate. */
        arc_buf_contents_t type = ARC_BUFC_DATA;
        /* A negative tunable is treated as "no restarts". */
        int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);

restart:
        /*
         * This slightly differs from the way we evict from the mru in
         * arc_adjust because we don't have a "target" value (i.e. no
         * "meta" arc_p). As a result, I think we can completely
         * cannibalize the metadata in the MRU before we evict the
         * metadata from the MFU. I think we probably need to implement a
         * "metadata arc_p" value to do this properly.
         */
        adjustmnt = meta_used - arc_meta_limit;

        if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
                delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
                    adjustmnt);
                total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
                adjustmnt -= delta;
        }

        /*
         * We can't afford to recalculate adjustmnt here. If we do,
         * new metadata buffers can sneak into the MRU or ANON lists,
         * thus penalize the MFU metadata. Although the fudge factor is
         * small, it has been empirically shown to be significant for
         * certain workloads (e.g. creating many empty directories). As
         * such, we use the original calculation for adjustmnt, and
         * simply decrement the amount of data evicted from the MRU.
         */

        if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
                delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
                    adjustmnt);
                total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
        }

        /* The ghost lists get the same full target, MRU ghost first. */
        adjustmnt = meta_used - arc_meta_limit;

        if (adjustmnt > 0 &&
            refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
                delta = MIN(adjustmnt,
                    refcount_count(&arc_mru_ghost->arcs_esize[type]));
                total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
                adjustmnt -= delta;
        }

        if (adjustmnt > 0 &&
            refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
                delta = MIN(adjustmnt,
                    refcount_count(&arc_mfu_ghost->arcs_esize[type]));
                total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
        }

        /*
         * If after attempting to make the requested adjustment to the ARC
         * the meta limit is still being exceeded then request that the
         * higher layers drop some cached objects which have holds on ARC
         * meta buffers.  Requests to the upper layers will be made with
         * increasingly large scan sizes until the ARC is below the limit.
         */
        if (meta_used > arc_meta_limit) {
                if (type == ARC_BUFC_DATA) {
                        type = ARC_BUFC_METADATA;
                } else {
                        type = ARC_BUFC_DATA;

                        /*
                         * A metadata pass has just finished; grow the
                         * prune request before the next data pass.
                         */
                        if (zfs_arc_meta_prune) {
                                prune += zfs_arc_meta_prune;
                                arc_prune_async(prune);
                        }
                }

                if (restarts > 0) {
                        restarts--;
                        goto restart;
                }
        }
        return (total_evicted);
}
4247
4248 /*
4249  * Evict metadata buffers from the cache, such that arc_meta_used is
4250  * capped by the arc_meta_limit tunable.
4251  */
4252 static uint64_t
4253 arc_adjust_meta_only(uint64_t meta_used)
4254 {
4255         uint64_t total_evicted = 0;
4256         int64_t target;
4257
4258         /*
4259          * If we're over the meta limit, we want to evict enough
4260          * metadata to get back under the meta limit. We don't want to
4261          * evict so much that we drop the MRU below arc_p, though. If
4262          * we're over the meta limit more than we're over arc_p, we
4263          * evict some from the MRU here, and some from the MFU below.
4264          */
4265         target = MIN((int64_t)(meta_used - arc_meta_limit),
4266             (int64_t)(refcount_count(&arc_anon->arcs_size) +
4267             refcount_count(&arc_mru->arcs_size) - arc_p));
4268
4269         total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4270
4271         /*
4272          * Similar to the above, we want to evict enough bytes to get us
4273          * below the meta limit, but not so much as to drop us below the
4274          * space allotted to the MFU (which is defined as arc_c - arc_p).
4275          */
4276         target = MIN((int64_t)(meta_used - arc_meta_limit),
4277             (int64_t)(refcount_count(&arc_mfu->arcs_size) -
4278             (arc_c - arc_p)));
4279
4280         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4281
4282         return (total_evicted);
4283 }
4284
4285 static uint64_t
4286 arc_adjust_meta(uint64_t meta_used)
4287 {
4288         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
4289                 return (arc_adjust_meta_only(meta_used));
4290         else
4291                 return (arc_adjust_meta_balanced(meta_used));
4292 }
4293
4294 /*
4295  * Return the type of the oldest buffer in the given arc state
4296  *
4297  * This function will select a random sublist of type ARC_BUFC_DATA and
4298  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
4299  * is compared, and the type which contains the "older" buffer will be
4300  * returned.
4301  */
4302 static arc_buf_contents_t
4303 arc_adjust_type(arc_state_t *state)
4304 {
4305         multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
4306         multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
4307         int data_idx = multilist_get_random_index(data_ml);
4308         int meta_idx = multilist_get_random_index(meta_ml);
4309         multilist_sublist_t *data_mls;
4310         multilist_sublist_t *meta_mls;
4311         arc_buf_contents_t type;
4312         arc_buf_hdr_t *data_hdr;
4313         arc_buf_hdr_t *meta_hdr;
4314
4315         /*
4316          * We keep the sublist lock until we're finished, to prevent
4317          * the headers from being destroyed via arc_evict_state().
4318          */
4319         data_mls = multilist_sublist_lock(data_ml, data_idx);
4320         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4321
4322         /*
4323          * These two loops are to ensure we skip any markers that
4324          * might be at the tail of the lists due to arc_evict_state().
4325          */
4326
4327         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4328             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4329                 if (data_hdr->b_spa != 0)
4330                         break;
4331         }
4332
4333         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4334             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4335                 if (meta_hdr->b_spa != 0)
4336                         break;
4337         }
4338
4339         if (data_hdr == NULL && meta_hdr == NULL) {
4340                 type = ARC_BUFC_DATA;
4341         } else if (data_hdr == NULL) {
4342                 ASSERT3P(meta_hdr, !=, NULL);
4343                 type = ARC_BUFC_METADATA;
4344         } else if (meta_hdr == NULL) {
4345                 ASSERT3P(data_hdr, !=, NULL);
4346                 type = ARC_BUFC_DATA;
4347         } else {
4348                 ASSERT3P(data_hdr, !=, NULL);
4349                 ASSERT3P(meta_hdr, !=, NULL);
4350
4351                 /* The headers can't be on the sublist without an L1 header */
4352                 ASSERT(HDR_HAS_L1HDR(data_hdr));
4353                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
4354
4355                 if (data_hdr->b_l1hdr.b_arc_access <
4356                     meta_hdr->b_l1hdr.b_arc_access) {
4357                         type = ARC_BUFC_DATA;
4358                 } else {
4359                         type = ARC_BUFC_METADATA;
4360                 }
4361         }
4362
4363         multilist_sublist_unlock(meta_mls);
4364         multilist_sublist_unlock(data_mls);
4365
4366         return (type);
4367 }
4368
4369 /*
4370  * Evict buffers from the cache, such that arc_size is capped by arc_c.
4371  */
4372 static uint64_t
4373 arc_adjust(void)
4374 {
4375         uint64_t total_evicted = 0;
4376         uint64_t bytes;
4377         int64_t target;
4378         uint64_t asize = aggsum_value(&arc_size);
4379         uint64_t ameta = aggsum_value(&arc_meta_used);
4380
4381         /*
4382          * If we're over arc_meta_limit, we want to correct that before
4383          * potentially evicting data buffers below.
4384          */
4385         total_evicted += arc_adjust_meta(ameta);
4386
4387         /*
4388          * Adjust MRU size
4389          *
4390          * If we're over the target cache size, we want to evict enough
4391          * from the list to get back to our target size. We don't want
4392          * to evict too much from the MRU, such that it drops below
4393          * arc_p. So, if we're over our target cache size more than
4394          * the MRU is over arc_p, we'll evict enough to get back to
4395          * arc_p here, and then evict more from the MFU below.
4396          */
4397         target = MIN((int64_t)(asize - arc_c),
4398             (int64_t)(refcount_count(&arc_anon->arcs_size) +
4399             refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
4400
4401         /*
4402          * If we're below arc_meta_min, always prefer to evict data.
4403          * Otherwise, try to satisfy the requested number of bytes to
4404          * evict from the type which contains older buffers; in an
4405          * effort to keep newer buffers in the cache regardless of their
4406          * type. If we cannot satisfy the number of bytes from this
4407          * type, spill over into the next type.
4408          */
4409         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
4410             ameta > arc_meta_min) {
4411                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4412                 total_evicted += bytes;
4413
4414                 /*
4415                  * If we couldn't evict our target number of bytes from
4416                  * metadata, we try to get the rest from data.
4417                  */
4418                 target -= bytes;
4419
4420                 total_evicted +=
4421                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4422         } else {
4423                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4424                 total_evicted += bytes;
4425
4426                 /*
4427                  * If we couldn't evict our target number of bytes from
4428                  * data, we try to get the rest from metadata.
4429                  */
4430                 target -= bytes;
4431
4432                 total_evicted +=
4433                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4434         }
4435
4436         /*
4437          * Adjust MFU size
4438          *
4439          * Now that we've tried to evict enough from the MRU to get its
4440          * size back to arc_p, if we're still above the target cache
4441          * size, we evict the rest from the MFU.
4442          */
4443         target = asize - arc_c;
4444
4445         if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
4446             ameta > arc_meta_min) {
4447                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4448                 total_evicted += bytes;
4449
4450                 /*
4451                  * If we couldn't evict our target number of bytes from
4452                  * metadata, we try to get the rest from data.
4453                  */
4454                 target -= bytes;
4455
4456                 total_evicted +=
4457                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4458         } else {
4459                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4460                 total_evicted += bytes;
4461
4462                 /*
4463                  * If we couldn't evict our target number of bytes from
4464                  * data, we try to get the rest from data.
4465                  */
4466                 target -= bytes;
4467
4468                 total_evicted +=
4469                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4470         }
4471
4472         /*
4473          * Adjust ghost lists
4474          *
4475          * In addition to the above, the ARC also defines target values
4476          * for the ghost lists. The sum of the mru list and mru ghost
4477          * list should never exceed the target size of the cache, and
4478          * the sum of the mru list, mfu list, mru ghost list, and mfu
4479          * ghost list should never exceed twice the target size of the
4480          * cache. The following logic enforces these limits on the ghost
4481          * caches, and evicts from them as needed.
4482          */
4483         target = refcount_count(&arc_mru->arcs_size) +
4484             refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
4485
4486         bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4487         total_evicted += bytes;
4488
4489         target -= bytes;
4490
4491         total_evicted +=
4492             arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4493
4494         /*
4495          * We assume the sum of the mru list and mfu list is less than
4496          * or equal to arc_c (we enforced this above), which means we
4497          * can use the simpler of the two equations below:
4498          *
4499          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4500          *                  mru ghost + mfu ghost <= arc_c
4501          */
4502         target = refcount_count(&arc_mru_ghost->arcs_size) +
4503             refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
4504
4505         bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4506         total_evicted += bytes;
4507
4508         target -= bytes;
4509
4510         total_evicted +=
4511             arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4512
4513         return (total_evicted);
4514 }
4515
4516 void
4517 arc_flush(spa_t *spa, boolean_t retry)
4518 {
4519         uint64_t guid = 0;
4520
4521         /*
4522          * If retry is B_TRUE, a spa must not be specified since we have
4523          * no good way to determine if all of a spa's buffers have been
4524          * evicted from an arc state.
4525          */
4526         ASSERT(!retry || spa == 0);
4527
4528         if (spa != NULL)
4529                 guid = spa_load_guid(spa);
4530
4531         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4532         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4533
4534         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4535         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4536
4537         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4538         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4539
4540         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4541         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4542 }
4543
4544 void
4545 arc_shrink(int64_t to_free)
4546 {
4547         uint64_t asize = aggsum_value(&arc_size);
4548         if (arc_c > arc_c_min) {
4549                 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
4550                         arc_c_min, uint64_t, arc_p, uint64_t, to_free);
4551                 if (arc_c > arc_c_min + to_free)
4552                         atomic_add_64(&arc_c, -to_free);
4553                 else
4554                         arc_c = arc_c_min;
4555
4556                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
4557                 if (asize < arc_c)
4558                         arc_c = MAX(asize, arc_c_min);
4559                 if (arc_p > arc_c)
4560                         arc_p = (arc_c >> 1);
4561
4562                 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
4563                         arc_p);
4564
4565                 ASSERT(arc_c >= arc_c_min);
4566                 ASSERT((int64_t)arc_p >= 0);
4567         }
4568
4569         if (asize > arc_c) {
4570                 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
4571                         uint64_t, arc_c);
4572                 (void) arc_adjust();
4573         }
4574 }
4575
/*
 * Records which check in arc_available_memory() produced the lowest
 * (most constraining) value.
 */
typedef enum free_memory_reason_t {
        /* Initial value; stays set if no check lowered the result. */
        FMR_UNKNOWN,
        /* needfree > 0 (non-FreeBSD kernel builds only). */
        FMR_NEEDFREE,
        /* freemem measured against the pageout / free-target threshold. */
        FMR_LOTSFREE,
        /* availrmem against the swapfs reserves (non-FreeBSD only). */
        FMR_SWAPFS_MINFREE,
        /* availrmem against pages_pp_maximum (non-FreeBSD only). */
        FMR_PAGES_PP_MAXIMUM,
        /* uma_avail() against a quarter of uma_limit(). */
        FMR_HEAP_ARENA,
        /* Free versus allocated space in zio_arena. */
        FMR_ZIO_ARENA,
} free_memory_reason_t;

/*
 * Result and reason from the most recent arc_available_memory() call;
 * both are overwritten on every call.
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve.
 * Subtracted from availrmem in the pages_pp_maximum check of
 * arc_available_memory().
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 * Subtracted from availrmem in the swapfs check of
 * arc_available_memory().
 */
int64_t arc_swapfs_reserve = 64;
4598
/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.
 *
 * Several independent checks each compute a byte count "n"; the smallest
 * one is returned.  That value and the check that produced it are also
 * stored in last_free_memory / last_free_reason and reported through the
 * arc__available_memory probe.
 */
static int64_t
arc_available_memory(void)
{
        /* Start at the maximum so the first check always lowers it. */
        int64_t lowest = INT64_MAX;
        int64_t n;
        free_memory_reason_t r = FMR_UNKNOWN;

#ifdef _KERNEL
#ifdef __FreeBSD__
        /*
         * Cooperate with pagedaemon when it's time for it to scan
         * and reclaim some pages.
         */
        n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
        if (n < lowest) {
                lowest = n;
                r = FMR_LOTSFREE;
        }

#else
        /* Pages are explicitly being asked for: report them as a deficit. */
        if (needfree > 0) {
                n = PAGESIZE * (-needfree);
                if (n < lowest) {
                        lowest = n;
                        r = FMR_NEEDFREE;
                }
        }

        /*
         * check that we're out of range of the pageout scanner.  It starts to
         * schedule paging if freemem is less than lotsfree and needfree.
         * lotsfree is the high-water mark for pageout, and needfree is the
         * number of needed free pages.  We add extra pages here to make sure
         * the scanner doesn't start up while we're freeing memory.
         */
        n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
        if (n < lowest) {
                lowest = n;
                r = FMR_LOTSFREE;
        }

        /*
         * check to make sure that swapfs has enough space so that anon
         * reservations can still succeed. anon_resvmem() checks that the
         * availrmem is greater than swapfs_minfree, and the number of reserved
         * swap pages.  We also add a bit of extra here just to prevent
         * circumstances from getting really dire.
         */
        n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
            desfree - arc_swapfs_reserve);
        if (n < lowest) {
                lowest = n;
                r = FMR_SWAPFS_MINFREE;
        }


        /*
         * Check that we have enough availrmem that memory locking (e.g., via
         * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
         * stores the number of pages that cannot be locked; when availrmem
         * drops below pages_pp_maximum, page locking mechanisms such as
         * page_pp_lock() will fail.)
         */
        n = PAGESIZE * (availrmem - pages_pp_maximum -
            arc_pages_pp_reserve);
        if (n < lowest) {
                lowest = n;
                r = FMR_PAGES_PP_MAXIMUM;
        }

#endif  /* __FreeBSD__ */
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
        /*
         * If we're on an i386 platform, it's possible that we'll exhaust the
         * kernel heap space before we ever run out of available physical
         * memory.  Most checks of the size of the heap_area compare against
         * tune.t_minarmem, which is the minimum available real memory that we
         * can have in the system.  However, this is generally fixed at 25 pages
         * which is so low that it's useless.  In this comparison, we seek to
         * calculate the total heap-size, and reclaim if more than 3/4ths of the
         * heap is allocated.  (Or, in the calculation, if less than 1/4th is
         * free)
         *
         * The code below expresses this as uma_avail() minus a quarter of
         * uma_limit(), and is also compiled in whenever
         * UMA_MD_SMALL_ALLOC is not defined.
         */
        n = uma_avail() - (long)(uma_limit() / 4);
        if (n < lowest) {
                lowest = n;
                r = FMR_HEAP_ARENA;
        }
#endif

        /*
         * If zio data pages are being allocated out of a separate heap segment,
         * then enforce that the size of available vmem for this arena remains
         * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
         *
         * Note that reducing the arc_zio_arena_free_shift keeps more virtual
         * memory (in the zio_arena) free, which can avoid memory
         * fragmentation issues.
         */
        if (zio_arena != NULL) {
                n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
                    (vmem_size(zio_arena, VMEM_ALLOC) >>
                    arc_zio_arena_free_shift);
                if (n < lowest) {
                        lowest = n;
                        r = FMR_ZIO_ARENA;
                }
        }

#else   /* _KERNEL */
        /*
         * Outside the kernel, report a small deficit whenever
         * spa_get_random(100) returns 0 (presumably about one call in
         * 100).  The reason is left as FMR_UNKNOWN on this path.
         */
        if (spa_get_random(100) == 0)
                lowest = -1024;
#endif  /* _KERNEL */

        /* Publish the result for observers before returning it. */
        last_free_memory = lowest;
        last_free_reason = r;
        DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
        return (lowest);
}
4724
4725
4726 /*
4727  * Determine if the system is under memory pressure and is asking
4728  * to reclaim memory. A return value of B_TRUE indicates that the system
4729  * is under memory pressure and that the arc should adjust accordingly.
4730  */
4731 static boolean_t
4732 arc_reclaim_needed(void)
4733 {
4734         return (arc_available_memory() < 0);
4735 }
4736
/*
 * kmem caches defined outside this chunk of the file; arc_kmem_reap_now()
 * passes each of them to kmem_cache_reap_soon().
 */
extern kmem_cache_t     *zio_buf_cache[];
extern kmem_cache_t     *zio_data_buf_cache[];
extern kmem_cache_t     *range_seg_cache;
extern kmem_cache_t     *abd_chunk_cache;
4741
4742 static __noinline void
4743 arc_kmem_reap_now(void)
4744 {
4745         size_t                  i;
4746         kmem_cache_t            *prev_cache = NULL;
4747         kmem_cache_t            *prev_data_cache = NULL;
4748
4749         DTRACE_PROBE(arc__kmem_reap_start);
4750 #ifdef _KERNEL
4751         if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
4752                 /*
4753                  * We are exceeding our meta-data cache limit.
4754                  * Purge some DNLC entries to release holds on meta-data.
4755                  */
4756                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4757         }
4758 #if defined(__i386)
4759         /*
4760          * Reclaim unused memory from all kmem caches.
4761          */
4762         kmem_reap();
4763 #endif
4764 #endif
4765
4766         /*
4767          * If a kmem reap is already active, don't schedule more.  We must
4768          * check for this because kmem_cache_reap_soon() won't actually
4769          * block on the cache being reaped (this is to prevent callers from
4770          * becoming implicitly blocked by a system-wide kmem reap -- which,
4771          * on a system with many, many full magazines, can take minutes).
4772          */
4773         if (kmem_cache_reap_active())
4774                 return;
4775
4776         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4777                 if (zio_buf_cache[i] != prev_cache) {
4778                         prev_cache = zio_buf_cache[i];
4779                         kmem_cache_reap_soon(zio_buf_cache[i]);
4780                 }
4781                 if (zio_data_buf_cache[i] != prev_data_cache) {
4782                         prev_data_cache = zio_data_buf_cache[i];
4783                         kmem_cache_reap_soon(zio_data_buf_cache[i]);
4784                 }
4785         }
4786         kmem_cache_reap_soon(abd_chunk_cache);
4787         kmem_cache_reap_soon(buf_cache);
4788         kmem_cache_reap_soon(hdr_full_cache);
4789         kmem_cache_reap_soon(hdr_l2only_cache);
4790         kmem_cache_reap_soon(range_seg_cache);
4791
4792 #ifdef illumos
4793         if (zio_arena != NULL) {
4794                 /*
4795                  * Ask the vmem arena to reclaim unused memory from its
4796                  * quantum caches.
4797                  */
4798                 vmem_qcache_reap(zio_arena);
4799         }
4800 #endif
4801         DTRACE_PROBE(arc__kmem_reap_end);
4802 }
4803
/*
 * Body of the ARC reclaim thread.  Each pass evicts via arc_adjust(),
 * then checks arc_available_memory(): under a deficit it reaps kmem caches
 * (rate limited) and shrinks the ARC target; otherwise it only updates
 * arc_no_grow.  It then wakes waiters and sleeps for up to one second
 * unless the cache is still over arc_c and the pass evicted something.
 * The loop runs until arc_reclaim_thread_exit is set.
 *
 * Threads can block in arc_get_data_impl() waiting for this thread to evict
 * enough data and signal them to proceed. When this happens, the threads in
 * arc_get_data_impl() are sleeping while holding the hash lock for their
 * particular arc header. Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 */
/* ARGSUSED */
static void
arc_reclaim_thread(void *unused __unused)
{
        /* Earliest time at which arc_no_grow may be cleared again. */
        hrtime_t                growtime = 0;
        /* Earliest time at which arc_kmem_reap_now() may run again. */
        hrtime_t                kmem_reap_time = 0;
        callb_cpr_t             cpr;

        CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

        mutex_enter(&arc_reclaim_lock);
        while (!arc_reclaim_thread_exit) {
                uint64_t evicted = 0;

                /*
                 * This is necessary in order for the mdb ::arc dcmd to
                 * show up to date information. Since the ::arc command
                 * does not call the kstat's update function, without
                 * this call, the command may show stale stats for the
                 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
                 * with this change, the data might be up to 1 second
                 * out of date; but that should suffice. The arc_state_t
                 * structures can be queried directly if more accurate
                 * information is needed.
                 */
                if (arc_ksp != NULL)
                        arc_ksp->ks_update(arc_ksp, KSTAT_READ);

                /* The eviction work below runs without arc_reclaim_lock. */
                mutex_exit(&arc_reclaim_lock);

                /*
                 * We call arc_adjust() before (possibly) calling
                 * arc_kmem_reap_now(), so that we can wake up
                 * arc_get_data_impl() sooner.
                 */
                evicted = arc_adjust();

                int64_t free_memory = arc_available_memory();
                if (free_memory < 0) {
                        hrtime_t curtime = gethrtime();
                        arc_no_grow = B_TRUE;
                        arc_warm = B_TRUE;

                        /*
                         * Wait at least arc_grow_retry (default 60) seconds
                         * before considering growing.
                         */
                        growtime = curtime + SEC2NSEC(arc_grow_retry);

                        /*
                         * Wait at least arc_kmem_cache_reap_retry_ms
                         * between arc_kmem_reap_now() calls. Without
                         * this check it is possible to end up in a
                         * situation where we spend lots of time
                         * reaping caches, while we're near arc_c_min.
                         */
                        if (curtime >= kmem_reap_time) {
                                arc_kmem_reap_now();
                                kmem_reap_time = gethrtime() +
                                    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
                        }

                        /*
                         * If we are still low on memory, shrink the ARC
                         * so that we have (arc_c >> arc_shrink_shift)
                         * bytes of free space.
                         */
                        free_memory = arc_available_memory();

                        int64_t to_free =
                            (arc_c >> arc_shrink_shift) - free_memory;
                        if (to_free > 0) {
#ifdef _KERNEL
#ifdef illumos
                                to_free = MAX(to_free, ptob(needfree));
#endif
#endif
                                arc_shrink(to_free);
                        }
                } else if (free_memory < arc_c >> arc_no_grow_shift) {
                        /* No deficit, but too little headroom to grow. */
                        arc_no_grow = B_TRUE;
                } else if (gethrtime() >= growtime) {
                        /* Enough headroom and the retry period has passed. */
                        arc_no_grow = B_FALSE;
                }

                mutex_enter(&arc_reclaim_lock);

                /*
                 * If evicted is zero, we couldn't evict anything via
                 * arc_adjust(). This could be due to hash lock
                 * collisions, but more likely due to the majority of
                 * arc buffers being unevictable. Therefore, even if
                 * arc_size is above arc_c, another pass is unlikely to
                 * be helpful and could potentially cause us to enter an
                 * infinite loop.
                 */
                if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
                        /*
                         * We're either no longer overflowing, or we
                         * can't evict anything more, so we should wake
                         * up any threads before we go to sleep.
                         */
                        cv_broadcast(&arc_reclaim_waiters_cv);

                        /*
                         * Block until signaled, or after one second (we
                         * might need to perform arc_kmem_reap_now()
                         * even if we aren't being signalled)
                         */
                        CALLB_CPR_SAFE_BEGIN(&cpr);
                        (void) cv_timedwait_hires(&arc_reclaim_thread_cv,
                            &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
                        CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
                }
        }

        /*
         * Clear the exit flag and broadcast on the thread's cv so that
         * whoever set the flag can observe the acknowledgement
         * (presumably the shutdown path waits on this cv -- confirm
         * against the caller).
         */
        arc_reclaim_thread_exit = B_FALSE;
        cv_broadcast(&arc_reclaim_thread_cv);
        CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_lock */
        thread_exit();
}
4940
/*
 * Pending request for arc_dnlc_evicts_thread(): the percentage handed to
 * dnlc_reduce_cache(), or 0 when no request is outstanding.  Read and
 * written under arc_dnlc_evicts_lock.
 */
static u_int arc_dnlc_evicts_arg;
/* Passed to vnlru_free() by arc_dnlc_evicts_thread(). */
extern struct vfsops zfs_vfsops;
4943
/*
 * Body of the DNLC eviction thread.  It sleeps on arc_dnlc_evicts_cv and,
 * each time it wakes with a non-zero arc_dnlc_evicts_arg, calls
 * vnlru_free() for that percentage of desiredvnodes (kernel builds only)
 * with arc_dnlc_evicts_lock dropped.  The loop runs until
 * arc_dnlc_evicts_thread_exit is set.
 *
 * NOTE(review): cv_wait() is entered at the top of every iteration before
 * arc_dnlc_evicts_arg is examined, so a request posted while this thread
 * is not yet waiting appears to stay pending until some later broadcast
 * -- and dnlc_reduce_cache() does not broadcast again while the token is
 * non-zero.  Confirm whether that window can be hit in practice.
 */
static void
arc_dnlc_evicts_thread(void *dummy __unused)
{
        callb_cpr_t cpr;
        u_int percent;

        CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);

        mutex_enter(&arc_dnlc_evicts_lock);
        while (!arc_dnlc_evicts_thread_exit) {
                CALLB_CPR_SAFE_BEGIN(&cpr);
                (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
                CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
                if (arc_dnlc_evicts_arg != 0) {
                        /* Copy the request, then work without the lock. */
                        percent = arc_dnlc_evicts_arg;
                        mutex_exit(&arc_dnlc_evicts_lock);
#ifdef _KERNEL
                        vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
#endif
                        mutex_enter(&arc_dnlc_evicts_lock);
                        /*
                         * Clear our token only after vnlru_free()
                         * pass is done, to avoid false queueing of
                         * the requests.
                         */
                        arc_dnlc_evicts_arg = 0;
                }
        }
        /* Acknowledge the exit request by clearing the flag. */
        arc_dnlc_evicts_thread_exit = FALSE;
        cv_broadcast(&arc_dnlc_evicts_cv);
        CALLB_CPR_EXIT(&cpr);
        thread_exit();
}
4977
4978 void
4979 dnlc_reduce_cache(void *arg)
4980 {
4981         u_int percent;
4982
4983         percent = (u_int)(uintptr_t)arg;
4984         mutex_enter(&arc_dnlc_evicts_lock);
4985         if (arc_dnlc_evicts_arg == 0) {
4986                 arc_dnlc_evicts_arg = percent;
4987                 cv_broadcast(&arc_dnlc_evicts_cv);
4988         }
4989         mutex_exit(&arc_dnlc_evicts_lock);
4990 }
4991
4992 /*
4993  * Adapt arc info given the number of bytes we are trying to add and
4994  * the state that we are comming from.  This function is only called
4995  * when we are adding new content to the cache.
4996  */
4997 static void
4998 arc_adapt(int bytes, arc_state_t *state)
4999 {
5000         int mult;
5001         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
5002         int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
5003         int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
5004
5005         if (state == arc_l2c_only)
5006                 return;
5007
5008         ASSERT(bytes > 0);
5009         /*
5010          * Adapt the target size of the MRU list:
5011          *      - if we just hit in the MRU ghost list, then increase
5012          *        the target size of the MRU list.
5013          *      - if we just hit in the MFU ghost list, then increase
5014          *        the target size of the MFU list by decreasing the
5015          *        target size of the MRU list.
5016          */
5017         if (state == arc_mru_ghost) {
5018                 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
5019                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
5020
5021                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
5022         } else if (state == arc_mfu_ghost) {
5023                 uint64_t delta;
5024
5025                 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
5026                 mult = MIN(mult, 10);
5027
5028                 delta = MIN(bytes * mult, arc_p);
5029                 arc_p = MAX(arc_p_min, arc_p - delta);
5030         }
5031         ASSERT((int64_t)arc_p >= 0);
5032
5033         if (arc_reclaim_needed()) {
5034                 cv_signal(&arc_reclaim_thread_cv);
5035                 return;
5036         }
5037
5038         if (arc_no_grow)
5039                 return;
5040
5041         if (arc_c >= arc_c_max)
5042                 return;
5043
5044         /*
5045          * If we're within (2 * maxblocksize) bytes of the target
5046          * cache size, increment the target cache size
5047          */
5048         if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
5049             0) {
5050                 DTRACE_PROBE1(arc__inc_adapt, int, bytes);
5051                 atomic_add_64(&arc_c, (int64_t)bytes);
5052                 if (arc_c > arc_c_max)
5053                         arc_c = arc_c_max;
5054                 else if (state == arc_anon)
5055                         atomic_add_64(&arc_p, (int64_t)bytes);
5056                 if (arc_p > arc_c)
5057                         arc_p = arc_c;
5058         }
5059         ASSERT((int64_t)arc_p >= 0);
5060 }
5061
5062 /*
5063  * Check if arc_size has grown past our upper threshold, determined by
5064  * zfs_arc_overflow_shift.
5065  */
5066 static boolean_t
5067 arc_is_overflowing(void)
5068 {
5069         /* Always allow at least one block of overflow */
5070         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
5071             arc_c >> zfs_arc_overflow_shift);
5072
5073         /*
5074          * We just compare the lower bound here for performance reasons. Our
5075          * primary goals are to make sure that the arc never grows without
5076          * bound, and that it can reach its maximum size. This check
5077          * accomplishes both goals. The maximum amount we could run over by is
5078          * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5079          * in the ARC. In practice, that's in the tens of MB, which is low
5080          * enough to be safe.
5081          */
5082         return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
5083 }
5084
5085 static abd_t *
5086 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5087 {
5088         arc_buf_contents_t type = arc_buf_type(hdr);
5089
5090         arc_get_data_impl(hdr, size, tag);
5091         if (type == ARC_BUFC_METADATA) {
5092                 return (abd_alloc(size, B_TRUE));
5093         } else {
5094                 ASSERT(type == ARC_BUFC_DATA);
5095                 return (abd_alloc(size, B_FALSE));
5096         }
5097 }
5098
5099 static void *
5100 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5101 {
5102         arc_buf_contents_t type = arc_buf_type(hdr);
5103
5104         arc_get_data_impl(hdr, size, tag);
5105         if (type == ARC_BUFC_METADATA) {
5106                 return (zio_buf_alloc(size));
5107         } else {
5108                 ASSERT(type == ARC_BUFC_DATA);
5109                 return (zio_data_buf_alloc(size));
5110         }
5111 }
5112
5113 /*
5114  * Allocate a block and return it to the caller. If we are hitting the
5115  * hard limit for the cache size, we must sleep, waiting for the eviction
5116  * thread to catch up. If we're past the target size but below the hard
5117  * limit, we'll only signal the reclaim thread and continue on.
5118  */
5119 static void
5120 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5121 {
5122         arc_state_t *state = hdr->b_l1hdr.b_state;
5123         arc_buf_contents_t type = arc_buf_type(hdr);
5124
5125         arc_adapt(size, state);
5126
5127         /*
5128          * If arc_size is currently overflowing, and has grown past our
5129          * upper limit, we must be adding data faster than the evict
5130          * thread can evict. Thus, to ensure we don't compound the
5131          * problem by adding more data and forcing arc_size to grow even
5132          * further past it's target size, we halt and wait for the
5133          * eviction thread to catch up.
5134          *
5135          * It's also possible that the reclaim thread is unable to evict
5136          * enough buffers to get arc_size below the overflow limit (e.g.
5137          * due to buffers being un-evictable, or hash lock collisions).
5138          * In this case, we want to proceed regardless if we're
5139          * overflowing; thus we don't use a while loop here.
5140          */
5141         if (arc_is_overflowing()) {
5142                 mutex_enter(&arc_reclaim_lock);
5143
5144                 /*
5145                  * Now that we've acquired the lock, we may no longer be
5146                  * over the overflow limit, lets check.
5147                  *
5148                  * We're ignoring the case of spurious wake ups. If that
5149                  * were to happen, it'd let this thread consume an ARC
5150                  * buffer before it should have (i.e. before we're under
5151                  * the overflow limit and were signalled by the reclaim
5152                  * thread). As long as that is a rare occurrence, it
5153                  * shouldn't cause any harm.
5154                  */
5155                 if (arc_is_overflowing()) {
5156                         cv_signal(&arc_reclaim_thread_cv);
5157                         cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
5158                 }
5159
5160                 mutex_exit(&arc_reclaim_lock);
5161         }
5162
5163         VERIFY3U(hdr->b_type, ==, type);
5164         if (type == ARC_BUFC_METADATA) {
5165                 arc_space_consume(size, ARC_SPACE_META);
5166         } else {
5167                 arc_space_consume(size, ARC_SPACE_DATA);
5168         }
5169
5170         /*
5171          * Update the state size.  Note that ghost states have a
5172          * "ghost size" and so don't need to be updated.
5173          */
5174         if (!GHOST_STATE(state)) {
5175
5176                 (void) refcount_add_many(&state->arcs_size, size, tag);
5177
5178                 /*
5179                  * If this is reached via arc_read, the link is
5180                  * protected by the hash lock. If reached via
5181                  * arc_buf_alloc, the header should not be accessed by
5182                  * any other thread. And, if reached via arc_read_done,
5183                  * the hash lock will protect it if it's found in the
5184                  * hash table; otherwise no other thread should be
5185                  * trying to [add|remove]_reference it.
5186                  */
5187                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5188                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5189                         (void) refcount_add_many(&state->arcs_esize[type],
5190                             size, tag);
5191                 }
5192
5193                 /*
5194                  * If we are growing the cache, and we are adding anonymous
5195                  * data, and we have outgrown arc_p, update arc_p
5196                  */
5197                 if (aggsum_compare(&arc_size, arc_c) < 0 &&
5198                     hdr->b_l1hdr.b_state == arc_anon &&
5199                     (refcount_count(&arc_anon->arcs_size) +
5200                     refcount_count(&arc_mru->arcs_size) > arc_p))
5201                         arc_p = MIN(arc_c, arc_p + size);
5202         }
5203         ARCSTAT_BUMP(arcstat_allocated);
5204 }
5205
5206 static void
5207 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
5208 {
5209         arc_free_data_impl(hdr, size, tag);
5210         abd_free(abd);
5211 }
5212
5213 static void
5214 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
5215 {
5216         arc_buf_contents_t type = arc_buf_type(hdr);
5217
5218         arc_free_data_impl(hdr, size, tag);
5219         if (type == ARC_BUFC_METADATA) {
5220                 zio_buf_free(buf, size);
5221         } else {
5222                 ASSERT(type == ARC_BUFC_DATA);
5223                 zio_data_buf_free(buf, size);
5224         }
5225 }
5226
5227 /*
5228  * Free the arc data buffer.
5229  */
5230 static void
5231 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5232 {
5233         arc_state_t *state = hdr->b_l1hdr.b_state;
5234         arc_buf_contents_t type = arc_buf_type(hdr);
5235
5236         /* protected by hash lock, if in the hash table */
5237         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5238                 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5239                 ASSERT(state != arc_anon && state != arc_l2c_only);
5240
5241                 (void) refcount_remove_many(&state->arcs_esize[type],
5242                     size, tag);
5243         }
5244         (void) refcount_remove_many(&state->arcs_size, size, tag);
5245
5246         VERIFY3U(hdr->b_type, ==, type);
5247         if (type == ARC_BUFC_METADATA) {
5248                 arc_space_return(size, ARC_SPACE_META);
5249         } else {
5250                 ASSERT(type == ARC_BUFC_DATA);
5251                 arc_space_return(size, ARC_SPACE_DATA);
5252         }
5253 }
5254
5255 /*
5256  * This routine is called whenever a buffer is accessed.
5257  * NOTE: the hash lock is dropped in this function.
5258  */
5259 static void
5260 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
5261 {
5262         clock_t now;
5263
5264         ASSERT(MUTEX_HELD(hash_lock));
5265         ASSERT(HDR_HAS_L1HDR(hdr));
5266
5267         if (hdr->b_l1hdr.b_state == arc_anon) {
5268                 /*
5269                  * This buffer is not in the cache, and does not
5270                  * appear in our "ghost" list.  Add the new buffer
5271                  * to the MRU state.
5272                  */
5273
5274                 ASSERT0(hdr->b_l1hdr.b_arc_access);
5275                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5276                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5277                 arc_change_state(arc_mru, hdr, hash_lock);
5278
5279         } else if (hdr->b_l1hdr.b_state == arc_mru) {
5280                 now = ddi_get_lbolt();
5281
5282                 /*
5283                  * If this buffer is here because of a prefetch, then either:
5284                  * - clear the flag if this is a "referencing" read
5285                  *   (any subsequent access will bump this into the MFU state).
5286                  * or
5287                  * - move the buffer to the head of the list if this is
5288                  *   another prefetch (to make it less likely to be evicted).
5289                  */
5290                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5291                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
5292                                 /* link protected by hash lock */
5293                                 ASSERT(multilist_link_active(
5294                                     &hdr->b_l1hdr.b_arc_node));
5295                         } else {
5296                                 arc_hdr_clear_flags(hdr,
5297                                     ARC_FLAG_PREFETCH |
5298                                     ARC_FLAG_PRESCIENT_PREFETCH);
5299                                 ARCSTAT_BUMP(arcstat_mru_hits);
5300                         }
5301                         hdr->b_l1hdr.b_arc_access = now;
5302                         return;
5303                 }
5304
5305                 /*
5306                  * This buffer has been "accessed" only once so far,
5307                  * but it is still in the cache. Move it to the MFU
5308                  * state.
5309                  */
5310                 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
5311                         /*
5312                          * More than 125ms have passed since we
5313                          * instantiated this buffer.  Move it to the
5314                          * most frequently used state.
5315                          */
5316                         hdr->b_l1hdr.b_arc_access = now;
5317                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5318                         arc_change_state(arc_mfu, hdr, hash_lock);
5319                 }
5320                 atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
5321                 ARCSTAT_BUMP(arcstat_mru_hits);
5322         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
5323                 arc_state_t     *new_state;
5324                 /*
5325                  * This buffer has been "accessed" recently, but
5326                  * was evicted from the cache.  Move it to the
5327                  * MFU state.
5328                  */
5329
5330                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5331                         new_state = arc_mru;
5332                         if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
5333                                 arc_hdr_clear_flags(hdr,
5334                                     ARC_FLAG_PREFETCH |
5335                                     ARC_FLAG_PRESCIENT_PREFETCH);
5336                         }
5337                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5338                 } else {
5339                         new_state = arc_mfu;
5340                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5341                 }
5342
5343                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5344                 arc_change_state(new_state, hdr, hash_lock);
5345
5346                 atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
5347                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
5348         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
5349                 /*
5350                  * This buffer has been accessed more than once and is
5351                  * still in the cache.  Keep it in the MFU state.
5352                  *
5353                  * NOTE: an add_reference() that occurred when we did
5354                  * the arc_read() will have kicked this off the list.
5355                  * If it was a prefetch, we will explicitly move it to
5356                  * the head of the list now.
5357                  */
5358
5359                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
5360                 ARCSTAT_BUMP(arcstat_mfu_hits);
5361                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5362         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
5363                 arc_state_t     *new_state = arc_mfu;
5364                 /*
5365                  * This buffer has been accessed more than once but has
5366                  * been evicted from the cache.  Move it back to the
5367                  * MFU state.
5368                  */
5369
5370                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5371                         /*
5372                          * This is a prefetch access...
5373                          * move this block back to the MRU state.
5374                          */
5375                         new_state = arc_mru;
5376                 }
5377
5378                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5379                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5380                 arc_change_state(new_state, hdr, hash_lock);
5381
5382                 atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
5383                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
5384         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
5385                 /*
5386                  * This buffer is on the 2nd Level ARC.
5387                  */
5388
5389                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5390                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5391                 arc_change_state(arc_mfu, hdr, hash_lock);
5392         } else {
5393                 ASSERT(!"invalid arc state");
5394         }
5395 }
5396
5397 /*
5398  * This routine is called by dbuf_hold() to update the arc_access() state
5399  * which otherwise would be skipped for entries in the dbuf cache.
5400  */
5401 void
5402 arc_buf_access(arc_buf_t *buf)
5403 {
5404         mutex_enter(&buf->b_evict_lock);
5405         arc_buf_hdr_t *hdr = buf->b_hdr;
5406
5407         /*
5408          * Avoid taking the hash_lock when possible as an optimization.
5409          * The header must be checked again under the hash_lock in order
5410          * to handle the case where it is concurrently being released.
5411          */
5412         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5413                 mutex_exit(&buf->b_evict_lock);
5414                 ARCSTAT_BUMP(arcstat_access_skip);
5415                 return;
5416         }
5417
5418         kmutex_t *hash_lock = HDR_LOCK(hdr);
5419         mutex_enter(hash_lock);
5420
5421         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5422                 mutex_exit(hash_lock);
5423                 mutex_exit(&buf->b_evict_lock);
5424                 ARCSTAT_BUMP(arcstat_access_skip);
5425                 return;
5426         }
5427
5428         mutex_exit(&buf->b_evict_lock);
5429
5430         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5431             hdr->b_l1hdr.b_state == arc_mfu);
5432
5433         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5434         arc_access(hdr, hash_lock);
5435         mutex_exit(hash_lock);
5436
5437         ARCSTAT_BUMP(arcstat_hits);
5438         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5439             demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5440 }
5441
5442 /* a generic arc_read_done_func_t which you can use */
5443 /* ARGSUSED */
5444 void
5445 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5446     arc_buf_t *buf, void *arg)
5447 {
5448         if (buf == NULL)
5449                 return;
5450
5451         bcopy(buf->b_data, arg, arc_buf_size(buf));
5452         arc_buf_destroy(buf, arg);
5453 }
5454
5455 /* a generic arc_read_done_func_t */
5456 /* ARGSUSED */
5457 void
5458 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5459     arc_buf_t *buf, void *arg)
5460 {
5461         arc_buf_t **bufp = arg;
5462         if (buf == NULL) {
5463                 ASSERT(zio == NULL || zio->io_error != 0);
5464                 *bufp = NULL;
5465         } else {
5466                 ASSERT(zio == NULL || zio->io_error == 0);
5467                 *bufp = buf;
5468                 ASSERT(buf->b_data != NULL);
5469         }
5470 }
5471
5472 static void
5473 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5474 {
5475         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5476                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
5477                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
5478         } else {
5479                 if (HDR_COMPRESSION_ENABLED(hdr)) {
5480                         ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
5481                             BP_GET_COMPRESS(bp));
5482                 }
5483                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5484                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5485         }
5486 }
5487
5488 static void
5489 arc_read_done(zio_t *zio)
5490 {
5491         arc_buf_hdr_t   *hdr = zio->io_private;
5492         kmutex_t        *hash_lock = NULL;
5493         arc_callback_t  *callback_list;
5494         arc_callback_t  *acb;
5495         boolean_t       freeable = B_FALSE;
5496         boolean_t       no_zio_error = (zio->io_error == 0);
5497
5498         /*
5499          * The hdr was inserted into hash-table and removed from lists
5500          * prior to starting I/O.  We should find this header, since
5501          * it's in the hash table, and it should be legit since it's
5502          * not possible to evict it during the I/O.  The only possible
5503          * reason for it not to be found is if we were freed during the
5504          * read.
5505          */
5506         if (HDR_IN_HASH_TABLE(hdr)) {
5507                 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
5508                 ASSERT3U(hdr->b_dva.dva_word[0], ==,
5509                     BP_IDENTITY(zio->io_bp)->dva_word[0]);
5510                 ASSERT3U(hdr->b_dva.dva_word[1], ==,
5511                     BP_IDENTITY(zio->io_bp)->dva_word[1]);
5512
5513                 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
5514                     &hash_lock);
5515
5516                 ASSERT((found == hdr &&
5517                     DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
5518                     (found == hdr && HDR_L2_READING(hdr)));
5519                 ASSERT3P(hash_lock, !=, NULL);
5520         }
5521
5522         if (no_zio_error) {
5523                 /* byteswap if necessary */
5524                 if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
5525                         if (BP_GET_LEVEL(zio->io_bp) > 0) {
5526                                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
5527                         } else {
5528                                 hdr->b_l1hdr.b_byteswap =
5529                                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
5530                         }
5531                 } else {
5532                         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
5533                 }
5534         }
5535
5536         arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
5537         if (l2arc_noprefetch && HDR_PREFETCH(hdr))
5538                 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
5539
5540         callback_list = hdr->b_l1hdr.b_acb;
5541         ASSERT3P(callback_list, !=, NULL);
5542
5543         if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
5544                 /*
5545                  * Only call arc_access on anonymous buffers.  This is because
5546                  * if we've issued an I/O for an evicted buffer, we've already
5547                  * called arc_access (to prevent any simultaneous readers from
5548                  * getting confused).
5549                  */
5550                 arc_access(hdr, hash_lock);
5551         }
5552
5553         /*
5554          * If a read request has a callback (i.e. acb_done is not NULL), then we
5555          * make a buf containing the data according to the parameters which were
5556          * passed in. The implementation of arc_buf_alloc_impl() ensures that we
5557          * aren't needlessly decompressing the data multiple times.
5558          */
5559         int callback_cnt = 0;
5560         for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
5561                 if (!acb->acb_done)
5562                         continue;
5563
5564                 callback_cnt++;
5565
5566                 if (no_zio_error) {
5567                         int error = arc_buf_alloc_impl(hdr, acb->acb_private,
5568                             acb->acb_compressed, zio->io_error == 0,
5569                             &acb->acb_buf);
5570                         if (error != 0) {
5571                                 /*
5572                                  * Decompression failed.  Set io_error
5573                                  * so that when we call acb_done (below),
5574                                  * we will indicate that the read failed.
5575                                  * Note that in the unusual case where one
5576                                  * callback is compressed and another
5577                                  * uncompressed, we will mark all of them
5578                                  * as failed, even though the uncompressed
5579                                  * one can't actually fail.  In this case,
5580                                  * the hdr will not be anonymous, because
5581                                  * if there are multiple callbacks, it's
5582                                  * because multiple threads found the same
5583                                  * arc buf in the hash table.
5584                                  */
5585                                 zio->io_error = error;
5586                         }
5587                 }
5588         }
5589         /*
5590          * If there are multiple callbacks, we must have the hash lock,
5591          * because the only way for multiple threads to find this hdr is
5592          * in the hash table.  This ensures that if there are multiple
5593          * callbacks, the hdr is not anonymous.  If it were anonymous,
5594          * we couldn't use arc_buf_destroy() in the error case below.
5595          */
5596         ASSERT(callback_cnt < 2 || hash_lock != NULL);
5597
5598         hdr->b_l1hdr.b_acb = NULL;
5599         arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5600         if (callback_cnt == 0) {
5601                 ASSERT(HDR_PREFETCH(hdr));
5602                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
5603                 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5604         }
5605
5606         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
5607             callback_list != NULL);
5608
5609         if (no_zio_error) {
5610                 arc_hdr_verify(hdr, zio->io_bp);
5611         } else {
5612                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
5613                 if (hdr->b_l1hdr.b_state != arc_anon)
5614                         arc_change_state(arc_anon, hdr, hash_lock);
5615                 if (HDR_IN_HASH_TABLE(hdr))
5616                         buf_hash_remove(hdr);
5617                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
5618         }
5619
5620         /*
5621          * Broadcast before we drop the hash_lock to avoid the possibility
5622          * that the hdr (and hence the cv) might be freed before we get to
5623          * the cv_broadcast().
5624          */
5625         cv_broadcast(&hdr->b_l1hdr.b_cv);
5626
5627         if (hash_lock != NULL) {
5628                 mutex_exit(hash_lock);
5629         } else {
5630                 /*
5631                  * This block was freed while we waited for the read to
5632                  * complete.  It has been removed from the hash table and
5633                  * moved to the anonymous state (so that it won't show up
5634                  * in the cache).
5635                  */
5636                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
5637                 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
5638         }
5639
5640         /* execute each callback and free its structure */
5641         while ((acb = callback_list) != NULL) {
5642                 if (acb->acb_done != NULL) {
5643                         if (zio->io_error != 0 && acb->acb_buf != NULL) {
5644                                 /*
5645                                  * If arc_buf_alloc_impl() fails during
5646                                  * decompression, the buf will still be
5647                                  * allocated, and needs to be freed here.
5648                                  */
5649                                 arc_buf_destroy(acb->acb_buf, acb->acb_private);
5650                                 acb->acb_buf = NULL;
5651                         }
5652                         acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
5653                             acb->acb_buf, acb->acb_private);
5654                 }
5655
5656                 if (acb->acb_zio_dummy != NULL) {
5657                         acb->acb_zio_dummy->io_error = zio->io_error;
5658                         zio_nowait(acb->acb_zio_dummy);
5659                 }
5660
5661                 callback_list = acb->acb_next;
5662                 kmem_free(acb, sizeof (arc_callback_t));
5663         }
5664
5665         if (freeable)
5666                 arc_hdr_destroy(hdr);
5667 }
5668
5669 /*
5670  * "Read" the block at the specified DVA (in bp) via the
5671  * cache.  If the block is found in the cache, invoke the provided
5672  * callback immediately and return.  Note that the `zio' parameter
5673  * in the callback will be NULL in this case, since no IO was
5674  * required.  If the block is not in the cache pass the read request
5675  * on to the spa with a substitute callback function, so that the
5676  * requested block will be added to the cache.
5677  *
5678  * If a read request arrives for a block that has a read in-progress,
5679  * either wait for the in-progress read to complete (and return the
5680  * results); or, if this is a read with a "done" func, add a record
5681  * to the read to invoke the "done" func when the read completes,
5682  * and return; or just return.
5683  *
5684  * arc_read_done() will invoke all the requested "done" functions
5685  * for readers of this block.
5686  */
5687 int
5688 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
5689     void *private, zio_priority_t priority, int zio_flags,
5690     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5691 {
5692         arc_buf_hdr_t *hdr = NULL;
5693         kmutex_t *hash_lock = NULL;
5694         zio_t *rzio;
5695         uint64_t guid = spa_load_guid(spa);
5696         boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
5697         int rc = 0;
5698
5699         ASSERT(!BP_IS_EMBEDDED(bp) ||
5700             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5701
5702 top:
5703         if (!BP_IS_EMBEDDED(bp)) {
5704                 /*
5705                  * Embedded BP's have no DVA and require no I/O to "read".
5706                  * Create an anonymous arc buf to back it.
5707                  */
5708                 hdr = buf_hash_find(guid, bp, &hash_lock);
5709         }
5710
5711         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
5712                 arc_buf_t *buf = NULL;
5713                 *arc_flags |= ARC_FLAG_CACHED;
5714
5715                 if (HDR_IO_IN_PROGRESS(hdr)) {
5716                         zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5717
5718                         ASSERT3P(head_zio, !=, NULL);
5719                         if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5720                             priority == ZIO_PRIORITY_SYNC_READ) {
5721                                 /*
5722                                  * This is a sync read that needs to wait for
5723                                  * an in-flight async read. Request that the
5724                                  * zio have its priority upgraded.
5725                                  */
5726                                 zio_change_priority(head_zio, priority);
5727                                 DTRACE_PROBE1(arc__async__upgrade__sync,
5728                                     arc_buf_hdr_t *, hdr);
5729                                 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5730                         }
5731                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5732                                 arc_hdr_clear_flags(hdr,
5733                                     ARC_FLAG_PREDICTIVE_PREFETCH);
5734                         }
5735
5736                         if (*arc_flags & ARC_FLAG_WAIT) {
5737                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
5738                                 mutex_exit(hash_lock);
5739                                 goto top;
5740                         }
5741                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5742
5743                         if (done) {
5744                                 arc_callback_t *acb = NULL;
5745
5746                                 acb = kmem_zalloc(sizeof (arc_callback_t),
5747                                     KM_SLEEP);
5748                                 acb->acb_done = done;
5749                                 acb->acb_private = private;
5750                                 acb->acb_compressed = compressed_read;
5751                                 if (pio != NULL)
5752                                         acb->acb_zio_dummy = zio_null(pio,
5753                                             spa, NULL, NULL, NULL, zio_flags);
5754
5755                                 ASSERT3P(acb->acb_done, !=, NULL);
5756                                 acb->acb_zio_head = head_zio;
5757                                 acb->acb_next = hdr->b_l1hdr.b_acb;
5758                                 hdr->b_l1hdr.b_acb = acb;
5759                                 mutex_exit(hash_lock);
5760                                 return (0);
5761                         }
5762                         mutex_exit(hash_lock);
5763                         return (0);
5764                 }
5765
5766                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5767                     hdr->b_l1hdr.b_state == arc_mfu);
5768
5769                 if (done) {
5770                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5771                                 /*
5772                                  * This is a demand read which does not have to
5773                                  * wait for i/o because we did a predictive
5774                                  * prefetch i/o for it, which has completed.
5775                                  */
5776                                 DTRACE_PROBE1(
5777                                     arc__demand__hit__predictive__prefetch,
5778                                     arc_buf_hdr_t *, hdr);
5779                                 ARCSTAT_BUMP(
5780                                     arcstat_demand_hit_predictive_prefetch);
5781                                 arc_hdr_clear_flags(hdr,
5782                                     ARC_FLAG_PREDICTIVE_PREFETCH);
5783                         }
5784
5785                         if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
5786                                 ARCSTAT_BUMP(
5787                                     arcstat_demand_hit_prescient_prefetch);
5788                                 arc_hdr_clear_flags(hdr,
5789                                     ARC_FLAG_PRESCIENT_PREFETCH);
5790                         }
5791
5792                         ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
5793                         /* Get a buf with the desired data in it. */
5794                         rc = arc_buf_alloc_impl(hdr, private,
5795                            compressed_read, B_TRUE, &buf);
5796                         if (rc != 0) {
5797                                 arc_buf_destroy(buf, private);
5798                                 buf = NULL;
5799                         }
5800                         ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
5801                             rc == 0 || rc != ENOENT);
5802                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
5803                     refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
5804                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5805                 }
5806                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5807                 arc_access(hdr, hash_lock);
5808                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5809                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5810                 if (*arc_flags & ARC_FLAG_L2CACHE)
5811                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5812                 mutex_exit(hash_lock);
5813                 ARCSTAT_BUMP(arcstat_hits);
5814                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5815                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5816                     data, metadata, hits);
5817
5818                 if (done)
5819                         done(NULL, zb, bp, buf, private);
5820         } else {
5821                 uint64_t lsize = BP_GET_LSIZE(bp);
5822                 uint64_t psize = BP_GET_PSIZE(bp);
5823                 arc_callback_t *acb;
5824                 vdev_t *vd = NULL;
5825                 uint64_t addr = 0;
5826                 boolean_t devw = B_FALSE;
5827                 uint64_t size;
5828
5829                 if (hdr == NULL) {
5830                         /* this block is not in the cache */
5831                         arc_buf_hdr_t *exists = NULL;
5832                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
5833                         hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
5834                             BP_GET_COMPRESS(bp), type);
5835
5836                         if (!BP_IS_EMBEDDED(bp)) {
5837                                 hdr->b_dva = *BP_IDENTITY(bp);
5838                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
5839                                 exists = buf_hash_insert(hdr, &hash_lock);
5840                         }
5841                         if (exists != NULL) {
5842                                 /* somebody beat us to the hash insert */
5843                                 mutex_exit(hash_lock);
5844                                 buf_discard_identity(hdr);
5845                                 arc_hdr_destroy(hdr);
5846                                 goto top; /* restart the IO request */
5847                         }
5848                 } else {
5849                         /*
5850                          * This block is in the ghost cache. If it was L2-only
5851                          * (and thus didn't have an L1 hdr), we realloc the
5852                          * header to add an L1 hdr.
5853                          */
5854                         if (!HDR_HAS_L1HDR(hdr)) {
5855                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
5856                                     hdr_full_cache);
5857                         }
5858                         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
5859                         ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
5860                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5861                         ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5862                         ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
5863                         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
5864
5865                         /*
5866                          * This is a delicate dance that we play here.
5867                          * This hdr is in the ghost list so we access it
5868                          * to move it out of the ghost list before we
5869                          * initiate the read. If it's a prefetch then
5870                          * it won't have a callback so we'll remove the
5871                          * reference that arc_buf_alloc_impl() created. We
5872                          * do this after we've called arc_access() to
5873                          * avoid hitting an assert in remove_reference().
5874                          */
5875                         arc_access(hdr, hash_lock);
5876                         arc_hdr_alloc_pabd(hdr);
5877                 }
5878                 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5879                 size = arc_hdr_size(hdr);
5880
5881                 /*
5882                  * If compression is enabled on the hdr, then will do
5883                  * RAW I/O and will store the compressed data in the hdr's
5884                  * data block. Otherwise, the hdr's data block will contain
5885                  * the uncompressed data.
5886                  */
5887                 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5888                         zio_flags |= ZIO_FLAG_RAW;
5889                 }
5890
5891                 if (*arc_flags & ARC_FLAG_PREFETCH)
5892                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5893                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5894                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5895
5896                 if (*arc_flags & ARC_FLAG_L2CACHE)
5897                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5898                 if (BP_GET_LEVEL(bp) > 0)
5899                         arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5900                 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5901                         arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5902                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5903
5904                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5905                 acb->acb_done = done;
5906                 acb->acb_private = private;
5907                 acb->acb_compressed = compressed_read;
5908
5909                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5910                 hdr->b_l1hdr.b_acb = acb;
5911                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5912
5913                 if (HDR_HAS_L2HDR(hdr) &&
5914                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5915                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5916                         addr = hdr->b_l2hdr.b_daddr;
5917                         /*
5918                          * Lock out L2ARC device removal.
5919                          */
5920                         if (vdev_is_dead(vd) ||
5921                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
5922                                 vd = NULL;
5923                 }
5924
5925                 /*
5926                  * We count both async reads and scrub IOs as asynchronous so
5927                  * that both can be upgraded in the event of a cache hit while
5928                  * the read IO is still in-flight.
5929                  */
5930                 if (priority == ZIO_PRIORITY_ASYNC_READ ||
5931                     priority == ZIO_PRIORITY_SCRUB)
5932                         arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5933                 else
5934                         arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5935
5936                 /*
5937                  * At this point, we have a level 1 cache miss.  Try again in
5938                  * L2ARC if possible.
5939                  */
5940                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
5941
5942                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
5943                     uint64_t, lsize, zbookmark_phys_t *, zb);
5944                 ARCSTAT_BUMP(arcstat_misses);
5945                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5946                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5947                     data, metadata, misses);
5948 #ifdef _KERNEL
5949 #ifdef RACCT
5950                 if (racct_enable) {
5951                         PROC_LOCK(curproc);
5952                         racct_add_force(curproc, RACCT_READBPS, size);
5953                         racct_add_force(curproc, RACCT_READIOPS, 1);
5954                         PROC_UNLOCK(curproc);
5955                 }
5956 #endif /* RACCT */
5957                 curthread->td_ru.ru_inblock++;
5958 #endif
5959
5960                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
5961                         /*
5962                          * Read from the L2ARC if the following are true:
5963                          * 1. The L2ARC vdev was previously cached.
5964                          * 2. This buffer still has L2ARC metadata.
5965                          * 3. This buffer isn't currently writing to the L2ARC.
5966                          * 4. The L2ARC entry wasn't evicted, which may
5967                          *    also have invalidated the vdev.
5968                          * 5. This isn't prefetch and l2arc_noprefetch is set.
5969                          */
5970                         if (HDR_HAS_L2HDR(hdr) &&
5971                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
5972                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
5973                                 l2arc_read_callback_t *cb;
5974                                 abd_t *abd;
5975                                 uint64_t asize;
5976
5977                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
5978                                 ARCSTAT_BUMP(arcstat_l2_hits);
5979                                 atomic_inc_32(&hdr->b_l2hdr.b_hits);
5980
5981                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
5982                                     KM_SLEEP);
5983                                 cb->l2rcb_hdr = hdr;
5984                                 cb->l2rcb_bp = *bp;
5985                                 cb->l2rcb_zb = *zb;
5986                                 cb->l2rcb_flags = zio_flags;
5987
5988                                 asize = vdev_psize_to_asize(vd, size);
5989                                 if (asize != size) {
5990                                         abd = abd_alloc_for_io(asize,
5991                                             HDR_ISTYPE_METADATA(hdr));
5992                                         cb->l2rcb_abd = abd;
5993                                 } else {
5994                                         abd = hdr->b_l1hdr.b_pabd;
5995                                 }
5996
5997                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
5998                                     addr + asize <= vd->vdev_psize -
5999                                     VDEV_LABEL_END_SIZE);
6000
6001                                 /*
6002                                  * l2arc read.  The SCL_L2ARC lock will be
6003                                  * released by l2arc_read_done().
6004                                  * Issue a null zio if the underlying buffer
6005                                  * was squashed to zero size by compression.
6006                                  */
6007                                 ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
6008                                     ZIO_COMPRESS_EMPTY);
6009                                 rzio = zio_read_phys(pio, vd, addr,
6010                                     asize, abd,
6011                                     ZIO_CHECKSUM_OFF,
6012                                     l2arc_read_done, cb, priority,
6013                                     zio_flags | ZIO_FLAG_DONT_CACHE |
6014                                     ZIO_FLAG_CANFAIL |
6015                                     ZIO_FLAG_DONT_PROPAGATE |
6016                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
6017                                 acb->acb_zio_head = rzio;
6018
6019                                 if (hash_lock != NULL)
6020                                         mutex_exit(hash_lock);
6021
6022                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6023                                     zio_t *, rzio);
6024                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
6025
6026                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
6027                                         zio_nowait(rzio);
6028                                         return (0);
6029                                 }
6030
6031                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
6032                                 if (zio_wait(rzio) == 0)
6033                                         return (0);
6034
6035                                 /* l2arc read error; goto zio_read() */
6036                                 if (hash_lock != NULL)
6037                                         mutex_enter(hash_lock);
6038                         } else {
6039                                 DTRACE_PROBE1(l2arc__miss,
6040                                     arc_buf_hdr_t *, hdr);
6041                                 ARCSTAT_BUMP(arcstat_l2_misses);
6042                                 if (HDR_L2_WRITING(hdr))
6043                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
6044                                 spa_config_exit(spa, SCL_L2ARC, vd);
6045                         }
6046                 } else {
6047                         if (vd != NULL)
6048                                 spa_config_exit(spa, SCL_L2ARC, vd);
6049                         if (l2arc_ndev != 0) {
6050                                 DTRACE_PROBE1(l2arc__miss,
6051                                     arc_buf_hdr_t *, hdr);
6052                                 ARCSTAT_BUMP(arcstat_l2_misses);
6053                         }
6054                 }
6055
6056                 rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
6057                     arc_read_done, hdr, priority, zio_flags, zb);
6058                 acb->acb_zio_head = rzio;
6059
6060                 if (hash_lock != NULL)
6061                         mutex_exit(hash_lock);
6062
6063                 if (*arc_flags & ARC_FLAG_WAIT)
6064                         return (zio_wait(rzio));
6065
6066                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6067                 zio_nowait(rzio);
6068         }
6069         return (0);
6070 }
6071
     /*
      * Register a prune callback with the ARC.
      *
      * Allocates an arc_prune_t that records 'func' and its 'private'
      * argument, and links it at the head of arc_prune_list under
      * arc_prune_mtx.  While it is on the list the entry carries one hold
      * on p_refcnt, tagged with &arc_prune_list.
      *
      * The allocation uses KM_SLEEP.  Returns the new entry, which is the
      * handle that arc_remove_prune_callback() takes to unregister it.
      */
6072 arc_prune_t *
6073 arc_add_prune_callback(arc_prune_func_t *func, void *private)
6074 {
6075         arc_prune_t *p;
6076
6077         p = kmem_alloc(sizeof (*p), KM_SLEEP);
6078         p->p_pfunc = func;
6079         p->p_private = private;
6080         list_link_init(&p->p_node);
6081         refcount_create(&p->p_refcnt);
6082
             /*
              * The hold and the list insertion are done together under
              * arc_prune_mtx; arc_remove_prune_callback() undoes both under
              * the same mutex.
              */
6083         mutex_enter(&arc_prune_mtx);
6084         refcount_add(&p->p_refcnt, &arc_prune_list);
6085         list_insert_head(&arc_prune_list, p);
6086         mutex_exit(&arc_prune_mtx);
6087
6088         return (p);
6089 }
6090
     /*
      * Unregister and free a prune callback entry.
      *
      * Under arc_prune_mtx the entry is unlinked from arc_prune_list and the
      * hold tagged with &arc_prune_list is dropped.  If refcount_remove()
      * reports that holds remain, we wait on arc_prune_taskq before tearing
      * the entry down.  'p' is freed on return and must not be used again.
      */
6091 void
6092 arc_remove_prune_callback(arc_prune_t *p)
6093 {
6094         boolean_t wait = B_FALSE;
6095         mutex_enter(&arc_prune_mtx);
6096         list_remove(&arc_prune_list, p);
             /*
              * A non-zero count after dropping the list's hold means
              * something else still holds the entry -- presumably an
              * in-flight arc_prune_task (not shown here; confirm).
              */
6097         if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
6098                 wait = B_TRUE;
6099         mutex_exit(&arc_prune_mtx);
6100
6101         /* wait for arc_prune_task to finish */
6102         if (wait)
6103                 taskq_wait(arc_prune_taskq);
             /* By now every hold must be gone, or freeing 'p' is unsafe. */
6104         ASSERT0(refcount_count(&p->p_refcnt));
6105         refcount_destroy(&p->p_refcnt);
6106         kmem_free(p, sizeof (*p));
6107 }
6108
6109 /*
6110  * Notify the arc that a block was freed, and thus will never be used again.
      *
      * The hdr for 'bp' is looked up using the spa's load guid.  If no hdr is
      * found this is a no-op.  Otherwise the hdr is moved to arc_anon and
      * destroyed, unless it has an L1 hdr that either has I/O in progress or
      * is still referenced (see the comment in the body).  'bp' must not be
      * an embedded block pointer.
6111  */
6112 void
6113 arc_freed(spa_t *spa, const blkptr_t *bp)
6114 {
6115         arc_buf_hdr_t *hdr;
6116         kmutex_t *hash_lock;
6117         uint64_t guid = spa_load_guid(spa);
6118
6119         ASSERT(!BP_IS_EMBEDDED(bp));
6120
6121         hdr = buf_hash_find(guid, bp, &hash_lock);
6122         if (hdr == NULL)
6123                 return;
6124
6125         /*
6126          * We might be trying to free a block that is still doing I/O
6127          * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
6128          * dmu_sync-ed block). If this block is being prefetched, then it
6129          * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
6130          * until the I/O completes. A block may also have a reference if it is
6131          * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6132          * have written the new block to its final resting place on disk but
6133          * without the dedup flag set. This would have left the hdr in the MRU
6134          * state and discoverable. When the txg finally syncs it detects that
6135          * the block was overridden in open context and issues an override I/O.
6136          * Since this is a dedup block, the override I/O will determine if the
6137          * block is already in the DDT. If so, then it will replace the io_bp
6138          * with the bp from the DDT and allow the I/O to finish. When the I/O
6139          * reaches the done callback, dbuf_write_override_done, it will
6140          * check to see if the io_bp and io_bp_override are identical.
6141          * If they are not, then it indicates that the bp was replaced with
6142          * the bp in the DDT and the override bp is freed. This allows
6143          * us to arrive here with a reference on a block that is being
6144          * freed. So if we have an I/O in progress, or a reference to
6145          * this hdr, then we don't destroy the hdr.
              *
              * Note that hash_lock is exited on both paths below without being
              * entered in this function, so buf_hash_find() evidently returns
              * with it held when it finds a hdr.
6146          */
6147         if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
6148             refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
6149                 arc_change_state(arc_anon, hdr, hash_lock);
6150                 arc_hdr_destroy(hdr);
6151                 mutex_exit(hash_lock);
6152         } else {
6153                 mutex_exit(hash_lock);
6154         }
6155
6156 }
6157
6158 /*
6159  * Release this buffer from the cache, making it an anonymous buffer.  This
6160  * must be done after a read and prior to modifying the buffer contents.
6161  * If the buffer has more than one reference, we must make
6162  * a new hdr for the buffer.
      *
      * 'tag' is the reference holder: it is passed to remove_reference() on
      * the old hdr and to refcount_add() on the new hdr when one is made.
      *
      * Three cases are handled below:
      *  - the hdr is already arc_anon: just discard its identity and thaw buf;
      *  - the hdr has more than one buf (b_bufcnt > 1): detach buf and give
      *    it a freshly allocated anonymous hdr;
      *  - otherwise: move the existing hdr itself to arc_anon.
6163  */
6164 void
6165 arc_release(arc_buf_t *buf, void *tag)
6166 {
6167         arc_buf_hdr_t *hdr = buf->b_hdr;
6168
6169         /*
6170          * It would be nice to assert that if it's DMU metadata (level >
6171          * 0 || it's the dnode file), then it must be syncing context.
6172          * But we don't know that information at this level.
6173          */
6174
6175         mutex_enter(&buf->b_evict_lock);
6176
6177         ASSERT(HDR_HAS_L1HDR(hdr));
6178
6179         /*
6180          * We don't grab the hash lock prior to this check, because if
6181          * the buffer's header is in the arc_anon state, it won't be
6182          * linked into the hash table.
6183          */
6184         if (hdr->b_l1hdr.b_state == arc_anon) {
6185                 mutex_exit(&buf->b_evict_lock);
6186                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6187                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
6188                 ASSERT(!HDR_HAS_L2HDR(hdr));
6189                 ASSERT(HDR_EMPTY(hdr));
6190                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6191                 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6192                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
6193
6194                 hdr->b_l1hdr.b_arc_access = 0;
6195
6196                 /*
6197                  * If the buf is being overridden then it may already
6198                  * have a hdr that is not empty.
6199                  */
6200                 buf_discard_identity(hdr);
6201                 arc_buf_thaw(buf);
6202
6203                 return;
6204         }
6205
             /* Not anonymous: everything below runs under the hdr's hash lock. */
6206         kmutex_t *hash_lock = HDR_LOCK(hdr);
6207         mutex_enter(hash_lock);
6208
6209         /*
6210          * This assignment is only valid as long as the hash_lock is
6211          * held, we must be careful not to reference state or the
6212          * b_state field after dropping the lock.
6213          */
6214         arc_state_t *state = hdr->b_l1hdr.b_state;
6215         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6216         ASSERT3P(state, !=, arc_anon);
6217
6218         /* this buffer is not on any list */
6219         ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
6220
6221         if (HDR_HAS_L2HDR(hdr)) {
6222                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6223
6224                 /*
6225                  * We have to recheck this conditional again now that
6226                  * we're holding the l2ad_mtx to prevent a race with
6227                  * another thread which might be concurrently calling
6228                  * l2arc_evict(). In that case, l2arc_evict() might have
6229                  * destroyed the header's L2 portion as we were waiting
6230                  * to acquire the l2ad_mtx.
6231                  */
6232                 if (HDR_HAS_L2HDR(hdr)) {
6233                         l2arc_trim(hdr);
6234                         arc_hdr_l2hdr_destroy(hdr);
6235                 }
6236
6237                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6238         }
6239
6240         /*
6241          * Do we have more than one buf?
6242          */
6243         if (hdr->b_l1hdr.b_bufcnt > 1) {
6244                 arc_buf_hdr_t *nhdr;
                     /*
                      * These hdr properties are captured while hash_lock
                      * is held; they are used to build nhdr after the
                      * lock has been dropped.
                      */
6245                 uint64_t spa = hdr->b_spa;
6246                 uint64_t psize = HDR_GET_PSIZE(hdr);
6247                 uint64_t lsize = HDR_GET_LSIZE(hdr);
6248                 enum zio_compress compress = HDR_GET_COMPRESS(hdr);
6249                 arc_buf_contents_t type = arc_buf_type(hdr);
6250                 VERIFY3U(hdr->b_type, ==, type);
6251
6252                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
6253                 (void) remove_reference(hdr, hash_lock, tag);
6254
6255                 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
6256                         ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6257                         ASSERT(ARC_BUF_LAST(buf));
6258                 }
6259
6260                 /*
6261                  * Pull the data off of this hdr and attach it to
6262                  * a new anonymous hdr. Also find the last buffer
6263                  * in the hdr's buffer list.
6264                  */
6265                 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
6266                 ASSERT3P(lastbuf, !=, NULL);
6267
6268                 /*
6269                  * If the current arc_buf_t and the hdr are sharing their data
6270                  * buffer, then we must stop sharing that block.
6271                  */
6272                 if (arc_buf_is_shared(buf)) {
6273                         VERIFY(!arc_buf_is_shared(lastbuf));
6274
6275                         /*
6276                          * First, sever the block sharing relationship between
6277                          * buf and the arc_buf_hdr_t.
6278                          */
6279                         arc_unshare_buf(hdr, buf);
6280
6281                         /*
6282                          * Now we need to recreate the hdr's b_pabd. Since we
6283                          * have lastbuf handy, we try to share with it, but if
6284                          * we can't then we allocate a new b_pabd and copy the
6285                          * data from buf into it.
6286                          */
6287                         if (arc_can_share(hdr, lastbuf)) {
6288                                 arc_share_buf(hdr, lastbuf);
6289                         } else {
6290                                 arc_hdr_alloc_pabd(hdr);
6291                                 abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6292                                     buf->b_data, psize);
6293                         }
6294                         VERIFY3P(lastbuf->b_data, !=, NULL);
6295                 } else if (HDR_SHARED_DATA(hdr)) {
6296                         /*
6297                          * Uncompressed shared buffers are always at the end
6298                          * of the list. Compressed buffers don't have the
6299                          * same requirements. This makes it hard to
6300                          * simply assert that the lastbuf is shared so
6301                          * we rely on the hdr's compression flags to determine
6302                          * if we have a compressed, shared buffer.
6303                          */
6304                         ASSERT(arc_buf_is_shared(lastbuf) ||
6305                             HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
6306                         ASSERT(!ARC_BUF_SHARED(buf));
6307                 }
6308                 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6309                 ASSERT3P(state, !=, arc_l2c_only);
6310
                     /*
                      * buf is leaving this hdr, so its size is removed from
                      * the old state's accounting; it is added to arc_anon's
                      * arcs_size at the end of this branch.
                      */
6311                 (void) refcount_remove_many(&state->arcs_size,
6312                     arc_buf_size(buf), buf);
6313
6314                 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6315                         ASSERT3P(state, !=, arc_l2c_only);
6316                         (void) refcount_remove_many(&state->arcs_esize[type],
6317                             arc_buf_size(buf), buf);
6318                 }
6319
6320                 hdr->b_l1hdr.b_bufcnt -= 1;
6321                 arc_cksum_verify(buf);
6322 #ifdef illumos
6323                 arc_buf_unwatch(buf);
6324 #endif
6325
                     /* Last use of the old hdr and of 'state' in this branch. */
6326                 mutex_exit(hash_lock);
6327
6328                 /*
6329                  * Allocate a new hdr. The new hdr will contain a b_pabd
6330                  * buffer which will be freed in arc_write().
6331                  */
6332                 nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
6333                 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
6334                 ASSERT0(nhdr->b_l1hdr.b_bufcnt);
6335                 ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
6336                 VERIFY3U(nhdr->b_type, ==, type);
6337                 ASSERT(!HDR_SHARED_DATA(nhdr));
6338
6339                 nhdr->b_l1hdr.b_buf = buf;
6340                 nhdr->b_l1hdr.b_bufcnt = 1;
6341                 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
6342                 buf->b_hdr = nhdr;
6343
                     /*
                      * b_evict_lock has been held since entry, i.e. across
                      * the switch of buf->b_hdr from hdr to nhdr.
                      */
6344                 mutex_exit(&buf->b_evict_lock);
6345                 (void) refcount_add_many(&arc_anon->arcs_size,
6346                     arc_buf_size(buf), buf);
6347         } else {
                     /*
                      * No other buf hangs off this hdr, so the hdr itself is
                      * made anonymous instead of allocating a new one.
                      */
6348                 mutex_exit(&buf->b_evict_lock);
6349                 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
6350                 /* protected by hash lock, or hdr is on arc_anon */
6351                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6352                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6353                 arc_change_state(arc_anon, hdr, hash_lock);
6354                 hdr->b_l1hdr.b_arc_access = 0;
6355                 mutex_exit(hash_lock);
6356
6357                 buf_discard_identity(hdr);
6358                 arc_buf_thaw(buf);
6359         }
6360 }
6361
     /*
      * Report whether 'buf' has been released from the cache.
      *
      * Returns non-zero when the buf still has data (b_data != NULL) and its
      * hdr is in the arc_anon state, zero otherwise.  Both fields are sampled
      * under b_evict_lock.
      */
6362 int
6363 arc_released(arc_buf_t *buf)
6364 {
6365         int released;
6366
6367         mutex_enter(&buf->b_evict_lock);
6368         released = (buf->b_data != NULL &&
6369             buf->b_hdr->b_l1hdr.b_state == arc_anon);
6370         mutex_exit(&buf->b_evict_lock);
6371         return (released);
6372 }
6373
6374 #ifdef ZFS_DEBUG
     /*
      * Debug-only (ZFS_DEBUG) helper: return the current reference count on
      * the hdr backing 'buf', sampled under b_evict_lock.  Non-zero means the
      * hdr is referenced.
      *
      * NOTE(review): the count is returned as an int; if refcount_count()
      * returns a wider type (its declaration is not visible in this chunk)
      * the value is narrowed here -- confirm that is harmless.
      */
6375 int
6376 arc_referenced(arc_buf_t *buf)
6377 {
6378         int referenced;
6379
6380         mutex_enter(&buf->b_evict_lock);
6381         referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
6382         mutex_exit(&buf->b_evict_lock);
6383         return (referenced);
6384 }
6385 #endif
6386
     /*
      * zio callback for an ARC write, reached with an arc_write_callback_t in
      * zio->io_private.  (Presumably installed as the zio's "ready" callback by
      * arc_write(); that function is outside this chunk -- confirm.)
      *
      * Steps, in order:
      *  1. If the zio is being re-executed, undo what a previous invocation
      *     left on the hdr (checksum, shared or private b_pabd).
      *  2. Invoke the consumer's awcb_ready callback.
      *  3. Call arc_cksum_compute() on the buf and mark the hdr as having
      *     I/O in progress.
      *  4. Record the physical size and compression taken from zio->io_bp.
      *  5. Populate the hdr's b_pabd, either by copying or by sharing the
      *     buf's data, then verify the hdr against zio->io_bp.
      */
6387 static void
6388 arc_write_ready(zio_t *zio)
6389 {
6390         arc_write_callback_t *callback = zio->io_private;
6391         arc_buf_t *buf = callback->awcb_buf;
6392         arc_buf_hdr_t *hdr = buf->b_hdr;
             /* psize is forced to 0 when the bp is a hole. */
6393         uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
6394
6395         ASSERT(HDR_HAS_L1HDR(hdr));
6396         ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
6397         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
6398
6399         /*
6400          * If we're reexecuting this zio because the pool suspended, then
6401          * cleanup any state that was previously set the first time the
6402          * callback was invoked.
6403          */
6404         if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
6405                 arc_cksum_free(hdr);
6406 #ifdef illumos
6407                 arc_buf_unwatch(buf);
6408 #endif
6409                 if (hdr->b_l1hdr.b_pabd != NULL) {
6410                         if (arc_buf_is_shared(buf)) {
6411                                 arc_unshare_buf(hdr, buf);
6412                         } else {
6413                                 arc_hdr_free_pabd(hdr);
6414                         }
6415                 }
6416         }
             /* From here on the hdr must hold no data and share nothing. */
6417         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6418         ASSERT(!HDR_SHARED_DATA(hdr));
6419         ASSERT(!arc_buf_is_shared(buf));
6420
6421         callback->awcb_ready(zio, buf, callback->awcb_private);
6422
             /*
              * The flag is set just below, so finding it already set is
              * only expected when this callback runs again for a
              * re-executed zio.
              */
6423         if (HDR_IO_IN_PROGRESS(hdr))
6424                 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
6425
6426         arc_cksum_compute(buf);
6427         arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6428
             /*
              * Holes and embedded bps are recorded on the hdr as
              * uncompressed; otherwise the bp's compression is used.
              */
6429         enum zio_compress compress;
6430         if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
6431                 compress = ZIO_COMPRESS_OFF;
6432         } else {
6433                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
6434                 compress = BP_GET_COMPRESS(zio->io_bp);
6435         }
6436         HDR_SET_PSIZE(hdr, psize);
6437         arc_hdr_set_compress(hdr, compress);
6438
6439
6440         /*
6441          * Fill the hdr with data. If the hdr is compressed, the data we want
6442          * is available from the zio, otherwise we can take it from the buf.
6443          *
6444          * We might be able to share the buf's data with the hdr here. However,
6445          * doing so would cause the ARC to be full of linear ABDs if we write a
6446          * lot of shareable data. As a compromise, we check whether scattered
6447          * ABDs are allowed, and assume that if they are then the user wants
6448          * the ARC to be primarily filled with them regardless of the data being
6449          * written. Therefore, if they're allowed then we allocate one and copy
6450          * the data into it; otherwise, we share the data directly if we can.
6451          */
6452         if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
6453                 arc_hdr_alloc_pabd(hdr);
6454
6455                 /*
6456                  * Ideally, we would always copy the io_abd into b_pabd, but the
6457                  * user may have disabled compressed ARC, thus we must check the
6458                  * hdr's compression setting rather than the io_bp's.
6459                  */
6460                 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
6461                         ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
6462                             ZIO_COMPRESS_OFF);
6463                         ASSERT3U(psize, >, 0);
6464
6465                         abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
6466                 } else {
6467                         ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
6468
6469                         abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
6470                             arc_buf_size(buf));
6471                 }
6472         } else {
                     /* Sharing path: no copy, the hdr adopts the buf's data. */
6473                 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
6474                 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
6475                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6476
6477                 arc_share_buf(hdr, buf);
6478         }
6479
6480         arc_hdr_verify(hdr, zio->io_bp);
6481 }
6482
     /*
      * zio callback for an ARC write: forwards to the consumer's
      * awcb_children_ready callback with the buf and private argument stored
      * in the arc_write_callback_t found in zio->io_private.
      *
      * NOTE(review): unlike arc_write_physdone(), the function pointer is not
      * checked for NULL here; presumably this callback is only installed when
      * awcb_children_ready is set -- confirm against arc_write().
      */
6483 static void
6484 arc_write_children_ready(zio_t *zio)
6485 {
6486         arc_write_callback_t *callback = zio->io_private;
6487         arc_buf_t *buf = callback->awcb_buf;
6488
6489         callback->awcb_children_ready(zio, buf, callback->awcb_private);
6490 }
6491
6492 /*
6493  * The SPA calls this callback for each physical write that happens on behalf
6494  * of a logical write.  See the comment in dbuf_write_physdone() for details.
      *
      * The consumer's awcb_physdone callback is optional: it is invoked, with
      * the buf and private argument from the arc_write_callback_t, only when
      * it is non-NULL.
6495  */
6496 static void
6497 arc_write_physdone(zio_t *zio)
6498 {
6499         arc_write_callback_t *cb = zio->io_private;
6500         if (cb->awcb_physdone != NULL)
6501                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6502 }
6503
/*
 * Completion callback for the zio created by arc_write().
 *
 * zio->io_private is the arc_write_callback_t that arc_write() allocated.
 * On success the header takes its identity (DVA and birth) from the block
 * pointer that was written and, unless it ended up without an identity, is
 * inserted into the buffer hash table.  In every case the caller's "done"
 * callback is invoked, the abd wrapping the buffer data is released and
 * the callback structure is freed.
 */
static void
arc_write_done(zio_t *zio)
{
        arc_write_callback_t *callback = zio->io_private;
        arc_buf_t *buf = callback->awcb_buf;
        arc_buf_hdr_t *hdr = buf->b_hdr;

        ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);

        if (zio->io_error == 0) {
                arc_hdr_verify(hdr, zio->io_bp);

                /*
                 * Holes and embedded block pointers carry no identity for
                 * the header to adopt, so drop any identity it had;
                 * otherwise adopt the identity of the written block.
                 */
                if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
                        buf_discard_identity(hdr);
                } else {
                        hdr->b_dva = *BP_IDENTITY(zio->io_bp);
                        hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
                }
        } else {
                /* A failed write must have left the header anonymous. */
                ASSERT(HDR_EMPTY(hdr));
        }

        /*
         * If the block to be written was all-zero or compressed enough to be
         * embedded in the BP, no write was performed so there will be no
         * dva/birth/checksum.  The buffer must therefore remain anonymous
         * (and uncached).
         */
        if (!HDR_EMPTY(hdr)) {
                arc_buf_hdr_t *exists;
                kmutex_t *hash_lock;

                ASSERT3U(zio->io_error, ==, 0);

                arc_cksum_verify(buf);

                /*
                 * hash_lock is filled in by buf_hash_insert() and is
                 * released by the mutex_exit() calls below.
                 */
                exists = buf_hash_insert(hdr, &hash_lock);
                if (exists != NULL) {
                        /*
                         * This can only happen if we overwrite for
                         * sync-to-convergence, because we remove
                         * buffers from the hash table when we arc_free().
                         */
                        if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
                                if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
                                        panic("bad overwrite, hdr=%p exists=%p",
                                            (void *)hdr, (void *)exists);
                                ASSERT(refcount_is_zero(
                                    &exists->b_l1hdr.b_refcnt));
                                /*
                                 * Retire the stale header, then retry the
                                 * insert, which is now expected to succeed.
                                 */
                                arc_change_state(arc_anon, exists, hash_lock);
                                mutex_exit(hash_lock);
                                arc_hdr_destroy(exists);
                                exists = buf_hash_insert(hdr, &hash_lock);
                                ASSERT3P(exists, ==, NULL);
                        } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
                                /* nopwrite */
                                ASSERT(zio->io_prop.zp_nopwrite);
                                if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
                                        panic("bad nopwrite, hdr=%p exists=%p",
                                            (void *)hdr, (void *)exists);
                        } else {
                                /* Dedup */
                                ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
                                ASSERT(hdr->b_l1hdr.b_state == arc_anon);
                                ASSERT(BP_GET_DEDUP(zio->io_bp));
                                ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
                        }
                }
                arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
                /* if it's not anon, we are doing a scrub */
                if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
                        arc_access(hdr, hash_lock);
                mutex_exit(hash_lock);
        } else {
                arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
        }

        ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
        /* Hand the result to the caller that issued arc_write(). */
        callback->awcb_done(zio, buf, callback->awcb_private);

        /* Release the abd handed to zio_write() and the callback record. */
        abd_put(zio->io_abd);
        kmem_free(callback, sizeof (arc_write_callback_t));
}
6587
6588 zio_t *
6589 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
6590     boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
6591     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
6592     arc_write_done_func_t *done, void *private, zio_priority_t priority,
6593     int zio_flags, const zbookmark_phys_t *zb)
6594 {
6595         arc_buf_hdr_t *hdr = buf->b_hdr;
6596         arc_write_callback_t *callback;
6597         zio_t *zio;
6598         zio_prop_t localprop = *zp;
6599
6600         ASSERT3P(ready, !=, NULL);
6601         ASSERT3P(done, !=, NULL);
6602         ASSERT(!HDR_IO_ERROR(hdr));
6603         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6604         ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6605         ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
6606         if (l2arc)
6607                 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6608         if (ARC_BUF_COMPRESSED(buf)) {
6609                 /*
6610                  * We're writing a pre-compressed buffer.  Make the
6611                  * compression algorithm requested by the zio_prop_t match
6612                  * the pre-compressed buffer's compression algorithm.
6613                  */
6614                 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
6615
6616                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
6617                 zio_flags |= ZIO_FLAG_RAW;
6618         }
6619         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
6620         callback->awcb_ready = ready;
6621         callback->awcb_children_ready = children_ready;
6622         callback->awcb_physdone = physdone;
6623         callback->awcb_done = done;
6624         callback->awcb_private = private;
6625         callback->awcb_buf = buf;
6626
6627         /*
6628          * The hdr's b_pabd is now stale, free it now. A new data block
6629          * will be allocated when the zio pipeline calls arc_write_ready().
6630          */
6631         if (hdr->b_l1hdr.b_pabd != NULL) {
6632                 /*
6633                  * If the buf is currently sharing the data block with
6634                  * the hdr then we need to break that relationship here.
6635                  * The hdr will remain with a NULL data pointer and the
6636                  * buf will take sole ownership of the block.
6637                  */
6638                 if (arc_buf_is_shared(buf)) {
6639                         arc_unshare_buf(hdr, buf);
6640                 } else {
6641                         arc_hdr_free_pabd(hdr);
6642                 }
6643                 VERIFY3P(buf->b_data, !=, NULL);
6644                 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
6645         }
6646         ASSERT(!arc_buf_is_shared(buf));
6647         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6648
6649         zio = zio_write(pio, spa, txg, bp,
6650             abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
6651             HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
6652             (children_ready != NULL) ? arc_write_children_ready : NULL,
6653             arc_write_physdone, arc_write_done, callback,
6654             priority, zio_flags, zb);
6655
6656         return (zio);
6657 }
6658
6659 static int
6660 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
6661 {
6662 #ifdef _KERNEL
6663         uint64_t available_memory = ptob(freemem);
6664
6665 #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
6666         available_memory = MIN(available_memory, uma_avail());
6667 #endif
6668
6669         if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
6670                 return (0);
6671
6672         if (txg > spa->spa_lowmem_last_txg) {
6673                 spa->spa_lowmem_last_txg = txg;
6674                 spa->spa_lowmem_page_load = 0;
6675         }
6676         /*
6677          * If we are in pageout, we know that memory is already tight,
6678          * the arc is already going to be evicting, so we just want to
6679          * continue to let page writes occur as quickly as possible.
6680          */
6681         if (curproc == pageproc) {
6682                 if (spa->spa_lowmem_page_load >
6683                     MAX(ptob(minfree), available_memory) / 4)
6684                         return (SET_ERROR(ERESTART));
6685                 /* Note: reserve is inflated, so we deflate */
6686                 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
6687                 return (0);
6688         } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
6689                 /* memory is low, delay before restarting */
6690                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
6691                 return (SET_ERROR(EAGAIN));
6692         }
6693         spa->spa_lowmem_page_load = 0;
6694 #endif /* _KERNEL */
6695         return (0);
6696 }
6697
6698 void
6699 arc_tempreserve_clear(uint64_t reserve)
6700 {
6701         atomic_add_64(&arc_tempreserve, -reserve);
6702         ASSERT((int64_t)arc_tempreserve >= 0);
6703 }
6704
6705 int
6706 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
6707 {
6708         int error;
6709         uint64_t anon_size;
6710
6711         if (reserve > arc_c/4 && !arc_no_grow) {
6712                 arc_c = MIN(arc_c_max, reserve * 4);
6713                 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
6714         }
6715         if (reserve > arc_c)
6716                 return (SET_ERROR(ENOMEM));
6717
6718         /*
6719          * Don't count loaned bufs as in flight dirty data to prevent long
6720          * network delays from blocking transactions that are ready to be
6721          * assigned to a txg.
6722          */
6723
6724         /* assert that it has not wrapped around */
6725         ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
6726
6727         anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
6728             arc_loaned_bytes), 0);
6729
6730         /*
6731          * Writes will, almost always, require additional memory allocations
6732          * in order to compress/encrypt/etc the data.  We therefore need to
6733          * make sure that there is sufficient available memory for this.
6734          */
6735         error = arc_memory_throttle(spa, reserve, txg);
6736         if (error != 0)
6737                 return (error);
6738
6739         /*
6740          * Throttle writes when the amount of dirty data in the cache
6741          * gets too large.  We try to keep the cache less than half full
6742          * of dirty blocks so that our sync times don't grow too large.
6743          *
6744          * In the case of one pool being built on another pool, we want
6745          * to make sure we don't end up throttling the lower (backing)
6746          * pool when the upper pool is the majority contributor to dirty
6747          * data. To insure we make forward progress during throttling, we
6748          * also check the current pool's net dirty data and only throttle
6749          * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
6750          * data in the cache.
6751          *
6752          * Note: if two requests come in concurrently, we might let them
6753          * both succeed, when one of them should fail.  Not a huge deal.
6754          */
6755         uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
6756         uint64_t spa_dirty_anon = spa_dirty_data(spa);
6757
6758         if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
6759             anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
6760             spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
6761                 uint64_t meta_esize =
6762                     refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6763                 uint64_t data_esize =
6764                     refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6765                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
6766                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
6767                     arc_tempreserve >> 10, meta_esize >> 10,
6768                     data_esize >> 10, reserve >> 10, arc_c >> 10);
6769                 return (SET_ERROR(ERESTART));
6770         }
6771         atomic_add_64(&arc_tempreserve, reserve);
6772         return (0);
6773 }
6774
6775 static void
6776 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
6777     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
6778 {
6779         size->value.ui64 = refcount_count(&state->arcs_size);
6780         evict_data->value.ui64 =
6781             refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
6782         evict_metadata->value.ui64 =
6783             refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
6784 }
6785
6786 static int
6787 arc_kstat_update(kstat_t *ksp, int rw)
6788 {
6789         arc_stats_t *as = ksp->ks_data;
6790
6791         if (rw == KSTAT_WRITE) {
6792                 return (EACCES);
6793         } else {
6794                 arc_kstat_update_state(arc_anon,
6795                     &as->arcstat_anon_size,
6796                     &as->arcstat_anon_evictable_data,
6797                     &as->arcstat_anon_evictable_metadata);
6798                 arc_kstat_update_state(arc_mru,
6799                     &as->arcstat_mru_size,
6800                     &as->arcstat_mru_evictable_data,
6801                     &as->arcstat_mru_evictable_metadata);
6802                 arc_kstat_update_state(arc_mru_ghost,
6803                     &as->arcstat_mru_ghost_size,
6804                     &as->arcstat_mru_ghost_evictable_data,
6805                     &as->arcstat_mru_ghost_evictable_metadata);
6806                 arc_kstat_update_state(arc_mfu,
6807                     &as->arcstat_mfu_size,
6808                     &as->arcstat_mfu_evictable_data,
6809                     &as->arcstat_mfu_evictable_metadata);
6810                 arc_kstat_update_state(arc_mfu_ghost,
6811                     &as->arcstat_mfu_ghost_size,
6812                     &as->arcstat_mfu_ghost_evictable_data,
6813                     &as->arcstat_mfu_ghost_evictable_metadata);
6814
6815                 ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
6816                 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
6817                 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
6818                 ARCSTAT(arcstat_metadata_size) =
6819                     aggsum_value(&astat_metadata_size);
6820                 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
6821                 ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
6822                 ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
6823                 ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
6824                 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
6825         }
6826
6827         return (0);
6828 }
6829
6830 /*
6831  * This function *must* return indices evenly distributed between all
6832  * sublists of the multilist. This is needed due to how the ARC eviction
6833  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
6834  * distributed between all sublists and uses this assumption when
6835  * deciding which sublist to evict from and how much to evict from it.
6836  */
6837 unsigned int
6838 arc_state_multilist_index_func(multilist_t *ml, void *obj)
6839 {
6840         arc_buf_hdr_t *hdr = obj;
6841
6842         /*
6843          * We rely on b_dva to generate evenly distributed index
6844          * numbers using buf_hash below. So, as an added precaution,
6845          * let's make sure we never add empty buffers to the arc lists.
6846          */
6847         ASSERT(!HDR_EMPTY(hdr));
6848
6849         /*
6850          * The assumption here, is the hash value for a given
6851          * arc_buf_hdr_t will remain constant throughout it's lifetime
6852          * (i.e. it's b_spa, b_dva, and b_birth fields don't change).
6853          * Thus, we don't need to store the header's sublist index
6854          * on insertion, as this index can be recalculated on removal.
6855          *
6856          * Also, the low order bits of the hash value are thought to be
6857          * distributed evenly. Otherwise, in the case that the multilist
6858          * has a power of two number of sublists, each sublists' usage
6859          * would not be evenly distributed.
6860          */
6861         return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
6862             multilist_get_num_sublists(ml));
6863 }
6864
#ifdef _KERNEL
/* Tag of the vm_lowmem handler registered by arc_init(). */
static eventhandler_tag arc_event_lowmem = NULL;

/*
 * vm_lowmem event handler: wake the ARC reclaim thread.  Both arguments
 * are unused.
 */
static void
arc_lowmem(void *arg __unused, int howto __unused)
{

        mutex_enter(&arc_reclaim_lock);
        DTRACE_PROBE1(arc__needfree, int64_t,
            ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE);
        cv_signal(&arc_reclaim_thread_cv);

        /*
         * Blocking here is only safe in the pageout process.  Arbitrary
         * threads can arrive here from the ARC itself while holding ARC
         * locks, and waiting would risk a deadlock with the ARC reclaim
         * thread.
         */
        if (curproc == pageproc)
                (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
        mutex_exit(&arc_reclaim_lock);
}
#endif
6886
6887 static void
6888 arc_state_init(void)
6889 {
6890         arc_anon = &ARC_anon;
6891         arc_mru = &ARC_mru;
6892         arc_mru_ghost = &ARC_mru_ghost;
6893         arc_mfu = &ARC_mfu;
6894         arc_mfu_ghost = &ARC_mfu_ghost;
6895         arc_l2c_only = &ARC_l2c_only;
6896
6897         arc_mru->arcs_list[ARC_BUFC_METADATA] =
6898             multilist_create(sizeof (arc_buf_hdr_t),
6899             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6900             arc_state_multilist_index_func);
6901         arc_mru->arcs_list[ARC_BUFC_DATA] =
6902             multilist_create(sizeof (arc_buf_hdr_t),
6903             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6904             arc_state_multilist_index_func);
6905         arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
6906             multilist_create(sizeof (arc_buf_hdr_t),
6907             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6908             arc_state_multilist_index_func);
6909         arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
6910             multilist_create(sizeof (arc_buf_hdr_t),
6911             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6912             arc_state_multilist_index_func);
6913         arc_mfu->arcs_list[ARC_BUFC_METADATA] =
6914             multilist_create(sizeof (arc_buf_hdr_t),
6915             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6916             arc_state_multilist_index_func);
6917         arc_mfu->arcs_list[ARC_BUFC_DATA] =
6918             multilist_create(sizeof (arc_buf_hdr_t),
6919             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6920             arc_state_multilist_index_func);
6921         arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
6922             multilist_create(sizeof (arc_buf_hdr_t),
6923             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6924             arc_state_multilist_index_func);
6925         arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
6926             multilist_create(sizeof (arc_buf_hdr_t),
6927             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6928             arc_state_multilist_index_func);
6929         arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
6930             multilist_create(sizeof (arc_buf_hdr_t),
6931             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6932             arc_state_multilist_index_func);
6933         arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
6934             multilist_create(sizeof (arc_buf_hdr_t),
6935             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
6936             arc_state_multilist_index_func);
6937
6938         refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6939         refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6940         refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
6941         refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
6942         refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
6943         refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
6944         refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
6945         refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
6946         refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
6947         refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
6948         refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
6949         refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
6950
6951         refcount_create(&arc_anon->arcs_size);
6952         refcount_create(&arc_mru->arcs_size);
6953         refcount_create(&arc_mru_ghost->arcs_size);
6954         refcount_create(&arc_mfu->arcs_size);
6955         refcount_create(&arc_mfu_ghost->arcs_size);
6956         refcount_create(&arc_l2c_only->arcs_size);
6957
6958         aggsum_init(&arc_meta_used, 0);
6959         aggsum_init(&arc_size, 0);
6960         aggsum_init(&astat_data_size, 0);
6961         aggsum_init(&astat_metadata_size, 0);
6962         aggsum_init(&astat_hdr_size, 0);
6963         aggsum_init(&astat_bonus_size, 0);
6964         aggsum_init(&astat_dnode_size, 0);
6965         aggsum_init(&astat_dbuf_size, 0);
6966         aggsum_init(&astat_l2_hdr_size, 0);
6967 }
6968
6969 static void
6970 arc_state_fini(void)
6971 {
6972         refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6973         refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6974         refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
6975         refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
6976         refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
6977         refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
6978         refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
6979         refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
6980         refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
6981         refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
6982         refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
6983         refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
6984
6985         refcount_destroy(&arc_anon->arcs_size);
6986         refcount_destroy(&arc_mru->arcs_size);
6987         refcount_destroy(&arc_mru_ghost->arcs_size);
6988         refcount_destroy(&arc_mfu->arcs_size);
6989         refcount_destroy(&arc_mfu_ghost->arcs_size);
6990         refcount_destroy(&arc_l2c_only->arcs_size);
6991
6992         multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
6993         multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
6994         multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
6995         multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
6996         multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
6997         multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
6998         multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
6999         multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7000 }
7001
7002 uint64_t
7003 arc_max_bytes(void)
7004 {
7005         return (arc_c_max);
7006 }
7007
7008 void
7009 arc_init(void)
7010 {
7011         int i, prefetch_tunable_set = 0;
7012
7013         /*
7014          * allmem is "all memory that we could possibly use".
7015          */
7016 #ifdef illumos
7017 #ifdef _KERNEL
7018         uint64_t allmem = ptob(physmem - swapfs_minfree);
7019 #else
7020         uint64_t allmem = (physmem * PAGESIZE) / 2;
7021 #endif
7022 #else
7023         uint64_t allmem = kmem_size();
7024 #endif
7025
7026
7027         mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
7028         cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
7029         cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
7030
7031         mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
7032         cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
7033
7034         /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
7035         arc_c_min = MAX(allmem / 32, arc_abs_min);
7036         /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
7037         if (allmem >= 1 << 30)
7038                 arc_c_max = allmem - (1 << 30);
7039         else
7040                 arc_c_max = arc_c_min;
7041         arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
7042
7043         /*
7044          * In userland, there's only the memory pressure that we artificially
7045          * create (see arc_available_memory()).  Don't let arc_c get too
7046          * small, because it can cause transactions to be larger than
7047          * arc_c, causing arc_tempreserve_space() to fail.
7048          */
7049 #ifndef _KERNEL
7050         arc_c_min = arc_c_max / 2;
7051 #endif
7052
7053 #ifdef _KERNEL
7054         /*
7055          * Allow the tunables to override our calculations if they are
7056          * reasonable.
7057          */
7058         if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) {
7059                 arc_c_max = zfs_arc_max;
7060                 arc_c_min = MIN(arc_c_min, arc_c_max);
7061         }
7062         if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
7063                 arc_c_min = zfs_arc_min;
7064 #endif
7065
7066         arc_c = arc_c_max;
7067         arc_p = (arc_c >> 1);
7068
7069         /* limit meta-data to 1/4 of the arc capacity */
7070         arc_meta_limit = arc_c_max / 4;
7071
7072 #ifdef _KERNEL
7073         /*
7074          * Metadata is stored in the kernel's heap.  Don't let us
7075          * use more than half the heap for the ARC.
7076          */
7077 #ifdef __FreeBSD__
7078         arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2);
7079         arc_dnode_limit = arc_meta_limit / 10;
7080 #else
7081         arc_meta_limit = MIN(arc_meta_limit,
7082             vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
7083 #endif
7084 #endif
7085
7086         /* Allow the tunable to override if it is reasonable */
7087         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
7088                 arc_meta_limit = zfs_arc_meta_limit;
7089
7090         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
7091                 arc_c_min = arc_meta_limit / 2;
7092
7093         if (zfs_arc_meta_min > 0) {
7094                 arc_meta_min = zfs_arc_meta_min;
7095         } else {
7096                 arc_meta_min = arc_c_min / 2;
7097         }
7098
7099         /* Valid range: <arc_meta_min> - <arc_c_max> */
7100         if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
7101             (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
7102             (zfs_arc_dnode_limit <= arc_c_max))
7103                 arc_dnode_limit = zfs_arc_dnode_limit;
7104
7105         if (zfs_arc_grow_retry > 0)
7106                 arc_grow_retry = zfs_arc_grow_retry;
7107
7108         if (zfs_arc_shrink_shift > 0)
7109                 arc_shrink_shift = zfs_arc_shrink_shift;
7110
7111         if (zfs_arc_no_grow_shift > 0)
7112                 arc_no_grow_shift = zfs_arc_no_grow_shift;
7113         /*
7114          * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
7115          */
7116         if (arc_no_grow_shift >= arc_shrink_shift)
7117                 arc_no_grow_shift = arc_shrink_shift - 1;
7118
7119         if (zfs_arc_p_min_shift > 0)
7120                 arc_p_min_shift = zfs_arc_p_min_shift;
7121
7122         /* if kmem_flags are set, lets try to use less memory */
7123         if (kmem_debugging())
7124                 arc_c = arc_c / 2;
7125         if (arc_c < arc_c_min)
7126                 arc_c = arc_c_min;
7127
7128         zfs_arc_min = arc_c_min;
7129         zfs_arc_max = arc_c_max;
7130
7131         arc_state_init();
7132         buf_init();
7133
7134         list_create(&arc_prune_list, sizeof (arc_prune_t),
7135             offsetof(arc_prune_t, p_node));
7136         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
7137
7138         arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
7139             max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
7140
7141         arc_reclaim_thread_exit = B_FALSE;
7142         arc_dnlc_evicts_thread_exit = FALSE;
7143
7144         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
7145             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
7146
7147         if (arc_ksp != NULL) {
7148                 arc_ksp->ks_data = &arc_stats;
7149                 arc_ksp->ks_update = arc_kstat_update;
7150                 kstat_install(arc_ksp);
7151         }
7152
7153         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
7154             TS_RUN, minclsyspri);
7155
7156 #ifdef _KERNEL
7157         arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
7158             EVENTHANDLER_PRI_FIRST);
7159 #endif
7160
7161         (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
7162             TS_RUN, minclsyspri);
7163
7164         arc_dead = B_FALSE;
7165         arc_warm = B_FALSE;
7166
7167         /*
7168          * Calculate maximum amount of dirty data per pool.
7169          *
7170          * If it has been set by /etc/system, take that.
7171          * Otherwise, use a percentage of physical memory defined by
7172          * zfs_dirty_data_max_percent (default 10%) with a cap at
7173          * zfs_dirty_data_max_max (default 4GB).
7174          */
7175         if (zfs_dirty_data_max == 0) {
7176                 zfs_dirty_data_max = ptob(physmem) *
7177                     zfs_dirty_data_max_percent / 100;
7178                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
7179                     zfs_dirty_data_max_max);
7180         }
7181
7182 #ifdef _KERNEL
7183         if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
7184                 prefetch_tunable_set = 1;
7185
7186 #ifdef __i386__
7187         if (prefetch_tunable_set == 0) {
7188                 printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
7189                     "-- to enable,\n");
7190                 printf("            add \"vfs.zfs.prefetch_disable=0\" "
7191                     "to /boot/loader.conf.\n");
7192                 zfs_prefetch_disable = 1;
7193         }
7194 #else
7195         if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
7196             prefetch_tunable_set == 0) {
7197                 printf("ZFS NOTICE: Prefetch is disabled by default if less "
7198                     "than 4GB of RAM is present;\n"
7199                     "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
7200                     "to /boot/loader.conf.\n");
7201                 zfs_prefetch_disable = 1;
7202         }
7203 #endif
7204         /* Warn about ZFS memory and address space requirements. */
7205         if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
7206                 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
7207                     "expect unstable behavior.\n");
7208         }
7209         if (allmem < 512 * (1 << 20)) {
7210                 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
7211                     "expect unstable behavior.\n");
7212                 printf("             Consider tuning vm.kmem_size and "
7213                     "vm.kmem_size_max\n");
7214                 printf("             in /boot/loader.conf.\n");
7215         }
7216 #endif
7217 }
7218
/*
 * Shut down the ARC: deregister the low-memory handler, stop the reclaim
 * and dnlc-evicts threads, flush all buffers, and destroy the kstats,
 * prune taskq/list, locks, condition variables, ARC states and buffer
 * hash set up by arc_init().
 */
void
arc_fini(void)
{
        arc_prune_t *p;

#ifdef _KERNEL
        if (arc_event_lowmem != NULL)
                EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
#endif

        mutex_enter(&arc_reclaim_lock);
        arc_reclaim_thread_exit = B_TRUE;
        /*
         * The reclaim thread will set arc_reclaim_thread_exit back to
         * B_FALSE when it is finished exiting; we're waiting for that.
         */
        while (arc_reclaim_thread_exit) {
                cv_signal(&arc_reclaim_thread_cv);
                cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
        }
        mutex_exit(&arc_reclaim_lock);

        /* Use B_TRUE to ensure *all* buffers are evicted */
        arc_flush(NULL, B_TRUE);

        mutex_enter(&arc_dnlc_evicts_lock);
        arc_dnlc_evicts_thread_exit = TRUE;
        /*
         * The dnlc evicts thread is expected to set
         * arc_dnlc_evicts_thread_exit back to FALSE when it is finished
         * exiting; we're waiting for that.
         */
        while (arc_dnlc_evicts_thread_exit) {
                cv_signal(&arc_dnlc_evicts_cv);
                cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
        }
        mutex_exit(&arc_dnlc_evicts_lock);

        arc_dead = B_TRUE;

        if (arc_ksp != NULL) {
                kstat_delete(arc_ksp);
                arc_ksp = NULL;
        }

        /* Let queued prune tasks finish before the taskq goes away. */
        taskq_wait(arc_prune_taskq);
        taskq_destroy(arc_prune_taskq);

        /* Drop and free every entry still on the prune list. */
        mutex_enter(&arc_prune_mtx);
        while ((p = list_head(&arc_prune_list)) != NULL) {
                list_remove(&arc_prune_list, p);
                refcount_remove(&p->p_refcnt, &arc_prune_list);
                refcount_destroy(&p->p_refcnt);
                kmem_free(p, sizeof (*p));
        }
        mutex_exit(&arc_prune_mtx);

        list_destroy(&arc_prune_list);
        mutex_destroy(&arc_prune_mtx);
        mutex_destroy(&arc_reclaim_lock);
        cv_destroy(&arc_reclaim_thread_cv);
        cv_destroy(&arc_reclaim_waiters_cv);

        mutex_destroy(&arc_dnlc_evicts_lock);
        cv_destroy(&arc_dnlc_evicts_cv);

        arc_state_fini();
        buf_fini();

        /* No loaned buffers may be outstanding at this point. */
        ASSERT0(arc_loaned_bytes);
}
7289
7290 /*
7291  * Level 2 ARC
7292  *
7293  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
7294  * It uses dedicated storage devices to hold cached data, which are populated
7295  * using large infrequent writes.  The main role of this cache is to boost
7296  * the performance of random read workloads.  The intended L2ARC devices
7297  * include short-stroked disks, solid state disks, and other media with
7298  * substantially faster read latency than disk.
7299  *
7300  *                 +-----------------------+
7301  *                 |         ARC           |
7302  *                 +-----------------------+
7303  *                    |         ^     ^
7304  *                    |         |     |
7305  *      l2arc_feed_thread()    arc_read()
7306  *                    |         |     |
7307  *                    |  l2arc read   |
7308  *                    V         |     |
7309  *               +---------------+    |
7310  *               |     L2ARC     |    |
7311  *               +---------------+    |
7312  *                   |    ^           |
7313  *          l2arc_write() |           |
7314  *                   |    |           |
7315  *                   V    |           |
7316  *                 +-------+      +-------+
7317  *                 | vdev  |      | vdev  |
7318  *                 | cache |      | cache |
7319  *                 +-------+      +-------+
7320  *                 +=========+     .-----.
7321  *                 :  L2ARC  :    |-_____-|
7322  *                 : devices :    | Disks |
7323  *                 +=========+    `-_____-'
7324  *
7325  * Read requests are satisfied from the following sources, in order:
7326  *
7327  *      1) ARC
7328  *      2) vdev cache of L2ARC devices
7329  *      3) L2ARC devices
7330  *      4) vdev cache of disks
7331  *      5) disks
7332  *
7333  * Some L2ARC device types exhibit extremely slow write performance.
7334  * To accommodate for this there are some significant differences between
7335  * the L2ARC and traditional cache design:
7336  *
7337  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
7338  * the ARC behave as usual, freeing buffers and placing headers on ghost
7339  * lists.  The ARC does not send buffers to the L2ARC during eviction as
7340  * this would add inflated write latencies for all ARC memory pressure.
7341  *
7342  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
7343  * It does this by periodically scanning buffers from the eviction-end of
7344  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
7345  * not already there. It scans until a headroom of buffers is satisfied,
7346  * which itself is a buffer for ARC eviction. If a compressible buffer is
7347  * found during scanning and selected for writing to an L2ARC device, we
7348  * temporarily boost scanning headroom during the next scan cycle to make
7349  * sure we adapt to compression effects (which might significantly reduce
7350  * the data volume we write to L2ARC). The thread that does this is
7351  * l2arc_feed_thread(), illustrated below; example sizes are included to
7352  * provide a better sense of ratio than this diagram:
7353  *
7354  *             head -->                        tail
7355  *              +---------------------+----------+
7356  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
7357  *              +---------------------+----------+   |   o L2ARC eligible
7358  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
7359  *              +---------------------+----------+   |
7360  *                   15.9 Gbytes      ^ 32 Mbytes    |
7361  *                                 headroom          |
7362  *                                            l2arc_feed_thread()
7363  *                                                   |
7364  *                       l2arc write hand <--[oooo]--'
7365  *                               |           8 Mbyte
7366  *                               |          write max
7367  *                               V
7368  *                +==============================+
7369  *      L2ARC dev |####|#|###|###|    |####| ... |
7370  *                +==============================+
7371  *                           32 Gbytes
7372  *
7373  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
7374  * evicted, then the L2ARC has cached a buffer much sooner than it probably
7375  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
7376  * safe to say that this is an uncommon case, since buffers at the end of
7377  * the ARC lists have moved there due to inactivity.
7378  *
7379  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
7380  * then the L2ARC simply misses copying some buffers.  This serves as a
7381  * pressure valve to prevent heavy read workloads from both stalling the ARC
7382  * with waits and clogging the L2ARC with writes.  This also helps prevent
7383  * the potential for the L2ARC to churn if it attempts to cache content too
7384  * quickly, such as during backups of the entire pool.
7385  *
7386  * 5. After system boot and before the ARC has filled main memory, there are
7387  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
7388  * lists can remain mostly static.  Instead of searching from the tail of
7389  * these lists as pictured, the l2arc_feed_thread() will search from the list
7390  * heads for eligible buffers, greatly increasing its chance of finding them.
7391  *
7392  * The L2ARC device write speed is also boosted during this time so that
7393  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
7394  * there are no L2ARC reads, and no fear of degrading read performance
7395  * through increased writes.
7396  *
7397  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
7398  * the vdev queue can aggregate them into larger and fewer writes.  Each
7399  * device is written to in a rotor fashion, sweeping writes through
7400  * available space then repeating.
7401  *
7402  * 7. The L2ARC does not store dirty content.  It never needs to flush
7403  * write buffers back to disk based storage.
7404  *
7405  * 8. If an ARC buffer is written (and dirtied) which also exists in the
7406  * L2ARC, the now stale L2ARC buffer is immediately dropped.
7407  *
7408  * The performance of the L2ARC can be tweaked by a number of tunables, which
7409  * may be necessary for different workloads:
7410  *
7411  *      l2arc_write_max         max write bytes per interval
7412  *      l2arc_write_boost       extra write bytes during device warmup
7413  *      l2arc_noprefetch        skip caching prefetched buffers
7414  *      l2arc_headroom          number of max device writes to precache
7415  *      l2arc_headroom_boost    when we find compressed buffers during ARC
7416  *                              scanning, we multiply headroom by this
7417  *                              percentage factor for the next scan cycle,
7418  *                              since more compressed buffers are likely to
7419  *                              be present
7420  *      l2arc_feed_secs         seconds between L2ARC writing
7421  *
7422  * Tunables may be removed or added as future performance improvements are
7423  * integrated, and also may become zpool properties.
7424  *
7425  * There are three key functions that control how the L2ARC warms up:
7426  *
7427  *      l2arc_write_eligible()  check if a buffer is eligible to cache
7428  *      l2arc_write_size()      calculate how much to write
7429  *      l2arc_write_interval()  calculate sleep delay between writes
7430  *
7431  * These three functions determine what to write, how much, and how quickly
7432  * to send writes.
7433  */
7434
7435 static boolean_t
7436 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
7437 {
7438         /*
7439          * A buffer is *not* eligible for the L2ARC if it:
7440          * 1. belongs to a different spa.
7441          * 2. is already cached on the L2ARC.
7442          * 3. has an I/O in progress (it may be an incomplete read).
7443          * 4. is flagged not eligible (zfs property).
7444          */
7445         if (hdr->b_spa != spa_guid) {
7446                 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
7447                 return (B_FALSE);
7448         }
7449         if (HDR_HAS_L2HDR(hdr)) {
7450                 ARCSTAT_BUMP(arcstat_l2_write_in_l2);
7451                 return (B_FALSE);
7452         }
7453         if (HDR_IO_IN_PROGRESS(hdr)) {
7454                 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
7455                 return (B_FALSE);
7456         }
7457         if (!HDR_L2CACHE(hdr)) {
7458                 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
7459                 return (B_FALSE);
7460         }
7461
7462         return (B_TRUE);
7463 }
7464
7465 static uint64_t
7466 l2arc_write_size(void)
7467 {
7468         uint64_t size;
7469
7470         /*
7471          * Make sure our globals have meaningful values in case the user
7472          * altered them.
7473          */
7474         size = l2arc_write_max;
7475         if (size == 0) {
7476                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
7477                     "be greater than zero, resetting it to the default (%d)",
7478                     L2ARC_WRITE_SIZE);
7479                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
7480         }
7481
7482         if (arc_warm == B_FALSE)
7483                 size += l2arc_write_boost;
7484
7485         return (size);
7486
7487 }
7488
7489 static clock_t
7490 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
7491 {
7492         clock_t interval, next, now;
7493
7494         /*
7495          * If the ARC lists are busy, increase our write rate; if the
7496          * lists are stale, idle back.  This is achieved by checking
7497          * how much we previously wrote - if it was more than half of
7498          * what we wanted, schedule the next write much sooner.
7499          */
7500         if (l2arc_feed_again && wrote > (wanted / 2))
7501                 interval = (hz * l2arc_feed_min_ms) / 1000;
7502         else
7503                 interval = hz * l2arc_feed_secs;
7504
7505         now = ddi_get_lbolt();
7506         next = MAX(now, MIN(now + interval, began + interval));
7507
7508         return (next);
7509 }
7510
7511 /*
7512  * Cycle through L2ARC devices.  This is how L2ARC load balances.
7513  * If a device is returned, this also returns holding the spa config lock.
7514  */
7515 static l2arc_dev_t *
7516 l2arc_dev_get_next(void)
7517 {
7518         l2arc_dev_t *first, *next = NULL;
7519
7520         /*
7521          * Lock out the removal of spas (spa_namespace_lock), then removal
7522          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
7523          * both locks will be dropped and a spa config lock held instead.
7524          */
7525         mutex_enter(&spa_namespace_lock);
7526         mutex_enter(&l2arc_dev_mtx);
7527
7528         /* if there are no vdevs, there is nothing to do */
7529         if (l2arc_ndev == 0)
7530                 goto out;
7531
7532         first = NULL;
7533         next = l2arc_dev_last;
7534         do {
7535                 /* loop around the list looking for a non-faulted vdev */
7536                 if (next == NULL) {
7537                         next = list_head(l2arc_dev_list);
7538                 } else {
7539                         next = list_next(l2arc_dev_list, next);
7540                         if (next == NULL)
7541                                 next = list_head(l2arc_dev_list);
7542                 }
7543
7544                 /* if we have come back to the start, bail out */
7545                 if (first == NULL)
7546                         first = next;
7547                 else if (next == first)
7548                         break;
7549
7550         } while (vdev_is_dead(next->l2ad_vdev));
7551
7552         /* if we were unable to find any usable vdevs, return NULL */
7553         if (vdev_is_dead(next->l2ad_vdev))
7554                 next = NULL;
7555
7556         l2arc_dev_last = next;
7557
7558 out:
7559         mutex_exit(&l2arc_dev_mtx);
7560
7561         /*
7562          * Grab the config lock to prevent the 'next' device from being
7563          * removed while we are writing to it.
7564          */
7565         if (next != NULL)
7566                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
7567         mutex_exit(&spa_namespace_lock);
7568
7569         return (next);
7570 }
7571
7572 /*
7573  * Free buffers that were tagged for destruction.
7574  */
7575 static void
7576 l2arc_do_free_on_write()
7577 {
7578         list_t *buflist;
7579         l2arc_data_free_t *df, *df_prev;
7580
7581         mutex_enter(&l2arc_free_on_write_mtx);
7582         buflist = l2arc_free_on_write;
7583
7584         for (df = list_tail(buflist); df; df = df_prev) {
7585                 df_prev = list_prev(buflist, df);
7586                 ASSERT3P(df->l2df_abd, !=, NULL);
7587                 abd_free(df->l2df_abd);
7588                 list_remove(buflist, df);
7589                 kmem_free(df, sizeof (l2arc_data_free_t));
7590         }
7591
7592         mutex_exit(&l2arc_free_on_write_mtx);
7593 }
7594
/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 *
 * zio->io_private carries the l2arc_write_callback_t for this write, which
 * names the device and the write-head marker on the device's buflist.
 * Every header in front of that marker has ARC_FLAG_L2_WRITING cleared; if
 * the zio failed, each such header is also dropped from the L2ARC.  The
 * marker and the callback are freed before returning.
 */
static void
l2arc_write_done(zio_t *zio)
{
        l2arc_write_callback_t *cb;
        l2arc_dev_t *dev;
        list_t *buflist;
        arc_buf_hdr_t *head, *hdr, *hdr_prev;
        kmutex_t *hash_lock;
        /* Sum of arc_hdr_size() over headers dropped because of an error. */
        int64_t bytes_dropped = 0;

        cb = zio->io_private;
        ASSERT3P(cb, !=, NULL);
        dev = cb->l2wcb_dev;
        ASSERT3P(dev, !=, NULL);
        head = cb->l2wcb_head;
        ASSERT3P(head, !=, NULL);
        buflist = &dev->l2ad_buflist;
        ASSERT3P(buflist, !=, NULL);
        DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
            l2arc_write_callback_t *, cb);

        if (zio->io_error != 0)
                ARCSTAT_BUMP(arcstat_l2_writes_error);

        /*
         * All writes completed, or an error was hit.
         */
top:
        mutex_enter(&dev->l2ad_mtx);
        /* Walk backwards from the write-head marker to the list front. */
        for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
                /* Saved up front: 'hdr' may be unlinked further down. */
                hdr_prev = list_prev(buflist, hdr);

                hash_lock = HDR_LOCK(hdr);

                /*
                 * We cannot use mutex_enter or else we can deadlock
                 * with l2arc_write_buffers (due to swapping the order
                 * the hash lock and l2ad_mtx are taken).
                 */
                if (!mutex_tryenter(hash_lock)) {
                        /*
                         * Missed the hash lock. We must retry so we
                         * don't leave the ARC_FLAG_L2_WRITING bit set.
                         */
                        ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

                        /*
                         * We don't want to rescan the headers we've
                         * already marked as having been written out, so
                         * we reinsert the head node so we can pick up
                         * where we left off.
                         */
                        list_remove(buflist, head);
                        list_insert_after(buflist, hdr, head);

                        mutex_exit(&dev->l2ad_mtx);

                        /*
                         * We wait for the hash lock to become available
                         * to try and prevent busy waiting, and increase
                         * the chance we'll be able to acquire the lock
                         * the next time around.
                         */
                        mutex_enter(hash_lock);
                        mutex_exit(hash_lock);
                        goto top;
                }

                /*
                 * We could not have been moved into the arc_l2c_only
                 * state while in-flight due to our ARC_FLAG_L2_WRITING
                 * bit being set. Let's just ensure that's being enforced.
                 */
                ASSERT(HDR_HAS_L1HDR(hdr));

                if (zio->io_error != 0) {
                        /*
                         * Error - drop L2ARC entry.
                         */
                        list_remove(buflist, hdr);
                        l2arc_trim(hdr);
                        arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);

                        /* Back the entry out of the L2 size kstats. */
                        ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
                        ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));

                        bytes_dropped += arc_hdr_size(hdr);
                        (void) refcount_remove_many(&dev->l2ad_alloc,
                            arc_hdr_size(hdr), hdr);
                }

                /*
                 * Allow ARC to begin reads and ghost list evictions to
                 * this L2ARC entry.
                 */
                arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);

                mutex_exit(hash_lock);
        }

        atomic_inc_64(&l2arc_writes_done);
        /* The marker has served its purpose; unlink and free it. */
        list_remove(buflist, head);
        ASSERT(!HDR_HAS_L1HDR(head));
        kmem_cache_free(hdr_l2only_cache, head);
        mutex_exit(&dev->l2ad_mtx);

        /* Report the dropped bytes to the vdev (zero when nothing failed). */
        vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);

        l2arc_do_free_on_write();

        kmem_free(cb, sizeof (l2arc_write_callback_t));
}
7711
/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 *
 * zio->io_private carries the l2arc_read_callback_t for this read.  The
 * SCL_L2ARC config lock on zio->io_spa (tagged with zio->io_vd) is released
 * here.  A read that has no error, a matching checksum and a header not
 * flagged L2_EVICTED is passed to arc_read_done().  Otherwise the zio is
 * marked failed and, if nobody is waiting on it, the block is re-read via
 * zio_read() with arc_read_done() as its completion.  The callback is freed
 * in either case.
 */
static void
l2arc_read_done(zio_t *zio)
{
        l2arc_read_callback_t *cb;
        arc_buf_hdr_t *hdr;
        kmutex_t *hash_lock;
        boolean_t valid_cksum;

        ASSERT3P(zio->io_vd, !=, NULL);
        ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

        /* NOTE(review): presumably taken when this read was issued. */
        spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

        cb = zio->io_private;
        ASSERT3P(cb, !=, NULL);
        hdr = cb->l2rcb_hdr;
        ASSERT3P(hdr, !=, NULL);

        hash_lock = HDR_LOCK(hdr);
        mutex_enter(hash_lock);
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

        /*
         * If the data was read into a temporary buffer,
         * move it and free the buffer.
         */
        if (cb->l2rcb_abd != NULL) {
                /* A temporary buffer implies the I/O was larger than the hdr. */
                ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
                if (zio->io_error == 0) {
                        abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd,
                            arc_hdr_size(hdr));
                }

                /*
                 * The following must be done regardless of whether
                 * there was an error:
                 * - free the temporary buffer
                 * - point zio to the real ARC buffer
                 * - set zio size accordingly
                 * These are required because zio is either re-used for
                 * an I/O of the block in the case of the error
                 * or the zio is passed to arc_read_done() and it
                 * needs real data.
                 */
                abd_free(cb->l2rcb_abd);
                zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
                zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
        }

        ASSERT3P(zio->io_abd, !=, NULL);

        /*
         * Check this survived the L2ARC journey.
         */
        ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
        /* Give the zio the block pointer saved in the callback. */
        zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
        zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */

        /* Checksum and eviction flag are examined under the hash lock. */
        valid_cksum = arc_cksum_is_equal(hdr, zio);
        if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
                mutex_exit(hash_lock);
                /* arc_read_done() is handed the header as its private data. */
                zio->io_private = hdr;
                arc_read_done(zio);
        } else {
                mutex_exit(hash_lock);
                /*
                 * Buffer didn't survive caching.  Increment stats and
                 * reissue to the original storage device.
                 */
                if (zio->io_error != 0) {
                        ARCSTAT_BUMP(arcstat_l2_io_error);
                } else {
                        /* No I/O error, so it was a bad cksum or an eviction. */
                        zio->io_error = SET_ERROR(EIO);
                }
                if (!valid_cksum)
                        ARCSTAT_BUMP(arcstat_l2_cksum_bad);

                /*
                 * If there's no waiter, issue an async i/o to the primary
                 * storage now.  If there *is* a waiter, the caller must
                 * issue the i/o in a context where it's OK to block.
                 */
                if (zio->io_waiter == NULL) {
                        zio_t *pio = zio_unique_parent(zio);

                        ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

                        zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
                            hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
                            hdr, zio->io_priority, cb->l2rcb_flags,
                            &cb->l2rcb_zb));
                }
        }

        kmem_free(cb, sizeof (l2arc_read_callback_t));
}
7812
7813 /*
7814  * This is the list priority from which the L2ARC will search for pages to
7815  * cache.  This is used within loops (0..3) to cycle through lists in the
7816  * desired order.  This order can have a significant effect on cache
7817  * performance.
7818  *
7819  * Currently the metadata lists are hit first, MFU then MRU, followed by
7820  * the data lists.  This function returns a locked list, and also returns
7821  * the lock pointer.
7822  */
7823 static multilist_sublist_t *
7824 l2arc_sublist_lock(int list_num)
7825 {
7826         multilist_t *ml = NULL;
7827         unsigned int idx;
7828
7829         ASSERT(list_num >= 0 && list_num <= 3);
7830
7831         switch (list_num) {
7832         case 0:
7833                 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
7834                 break;
7835         case 1:
7836                 ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
7837                 break;
7838         case 2:
7839                 ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
7840                 break;
7841         case 3:
7842                 ml = arc_mru->arcs_list[ARC_BUFC_DATA];
7843                 break;
7844         }
7845
7846         /*
7847          * Return a randomly-selected sublist. This is acceptable
7848          * because the caller feeds only a little bit of data for each
7849          * call (8MB). Subsequent calls will result in different
7850          * sublists being selected.
7851          */
7852         idx = multilist_get_random_index(ml);
7853         return (multilist_sublist_lock(ml, idx));
7854 }
7855
/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 *
 *      dev             cache device whose buflist is walked
 *      distance        number of bytes ahead of dev->l2ad_hand to clear
 *      all             evict every header regardless of its device address
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
        list_t *buflist;
        arc_buf_hdr_t *hdr, *hdr_prev;
        kmutex_t *hash_lock;
        /* Device address up to which (exclusive) headers are evicted. */
        uint64_t taddr;

        buflist = &dev->l2ad_buflist;

        if (!all && dev->l2ad_first) {
                /*
                 * This is the first sweep through the device.  There is
                 * nothing to evict.
                 */
                return;
        }

        if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
                /*
                 * When nearing the end of the device, evict to the end
                 * before the device write hand jumps to the start.
                 */
                taddr = dev->l2ad_end;
        } else {
                taddr = dev->l2ad_hand + distance;
        }
        DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
            uint64_t, taddr, boolean_t, all);

top:
        mutex_enter(&dev->l2ad_mtx);
        /* Walk the buflist from its tail towards its head. */
        for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
                /* Saved up front: 'hdr' may be destroyed further down. */
                hdr_prev = list_prev(buflist, hdr);

                hash_lock = HDR_LOCK(hdr);

                /*
                 * We cannot use mutex_enter or else we can deadlock
                 * with l2arc_write_buffers (due to swapping the order
                 * the hash lock and l2ad_mtx are taken).
                 */
                if (!mutex_tryenter(hash_lock)) {
                        /*
                         * Missed the hash lock.  Retry.
                         */
                        ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
                        mutex_exit(&dev->l2ad_mtx);
                        /* Block until the holder lets go, then start over. */
                        mutex_enter(hash_lock);
                        mutex_exit(hash_lock);
                        goto top;
                }

                /*
                 * A header can't be on this list if it doesn't have L2 header.
                 */
                ASSERT(HDR_HAS_L2HDR(hdr));

                /* Ensure this header has finished being written. */
                ASSERT(!HDR_L2_WRITING(hdr));
                ASSERT(!HDR_L2_WRITE_HEAD(hdr));

                /* Stop at the first header outside [l2ad_hand, taddr). */
                if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
                    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
                        /*
                         * We've evicted to the target address,
                         * or the end of the device.
                         */
                        mutex_exit(hash_lock);
                        break;
                }

                if (!HDR_HAS_L1HDR(hdr)) {
                        ASSERT(!HDR_L2_READING(hdr));
                        /*
                         * This doesn't exist in the ARC.  Destroy.
                         * arc_hdr_destroy() will call list_remove()
                         * and decrement arcstat_l2_lsize.
                         */
                        arc_change_state(arc_anon, hdr, hash_lock);
                        arc_hdr_destroy(hdr);
                } else {
                        ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
                        ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
                        /*
                         * Invalidate issued or about to be issued
                         * reads, since we may be about to write
                         * over this location.
                         */
                        if (HDR_L2_READING(hdr)) {
                                ARCSTAT_BUMP(arcstat_l2_evict_reading);
                                arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
                        }

                        /* The header stays in the ARC; only its L2 part goes. */
                        arc_hdr_l2hdr_destroy(hdr);
                }
                mutex_exit(hash_lock);
        }
        mutex_exit(&dev->l2ad_mtx);
}
7962
7963 /*
7964  * Find and write ARC buffers to the L2ARC device.
7965  *
7966  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
7967  * for reading until they have completed writing.
7968  * The scan headroom is target_sz * l2arc_headroom, scaled by the
7969  * l2arc_headroom_boost percentage when compressed ARC is enabled.
7970  *
7971  * Returns the number of bytes actually written (which may be smaller than
7972  * the delta by which the device hand has changed due to alignment).
7973  */
7974 static uint64_t
7975 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
7976 {
7977         arc_buf_hdr_t *hdr, *hdr_prev, *head;
7978         uint64_t write_asize, write_psize, write_lsize, headroom;
7979         boolean_t full;
7980         l2arc_write_callback_t *cb;
7981         zio_t *pio, *wzio;
7982         uint64_t guid = spa_load_guid(spa);
7983         int try;
7984
7985         ASSERT3P(dev->l2ad_vdev, !=, NULL);
7986
7987         pio = NULL;
7988         write_lsize = write_asize = write_psize = 0;
7989         full = B_FALSE;
7990         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
7991         arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
7992
7993         ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
7994         /*
7995          * Copy buffers for L2ARC writing.
7996          */
7997         for (try = 0; try <= 3; try++) {
7998                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
7999                 uint64_t passed_sz = 0;
8000
8001                 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
8002
8003                 /*
8004                  * L2ARC fast warmup.
8005                  *
8006                  * Until the ARC is warm and starts to evict, read from the
8007                  * head of the ARC lists rather than the tail.
8008                  */
8009                 if (arc_warm == B_FALSE)
8010                         hdr = multilist_sublist_head(mls);
8011                 else
8012                         hdr = multilist_sublist_tail(mls);
8013                 if (hdr == NULL)
8014                         ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
8015
8016                 headroom = target_sz * l2arc_headroom;
8017                 if (zfs_compressed_arc_enabled)
8018                         headroom = (headroom * l2arc_headroom_boost) / 100;
8019
8020                 for (; hdr; hdr = hdr_prev) {
8021                         kmutex_t *hash_lock;
8022
8023                         if (arc_warm == B_FALSE)
8024                                 hdr_prev = multilist_sublist_next(mls, hdr);
8025                         else
8026                                 hdr_prev = multilist_sublist_prev(mls, hdr);
8027                         ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
8028                             HDR_GET_LSIZE(hdr));
8029
8030                         hash_lock = HDR_LOCK(hdr);
8031                         if (!mutex_tryenter(hash_lock)) {
8032                                 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
8033                                 /*
8034                                  * Skip this buffer rather than waiting.
8035                                  */
8036                                 continue;
8037                         }
8038
8039                         passed_sz += HDR_GET_LSIZE(hdr);
8040                         if (passed_sz > headroom) {
8041                                 /*
8042                                  * Searched too far.
8043                                  */
8044                                 mutex_exit(hash_lock);
8045                                 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
8046                                 break;
8047                         }
8048
8049                         if (!l2arc_write_eligible(guid, hdr)) {
8050                                 mutex_exit(hash_lock);
8051                                 continue;
8052                         }
8053
8054                         /*
8055                          * We rely on the L1 portion of the header below, so
8056                          * it's invalid for this header to have been evicted out
8057                          * of the ghost cache, prior to being written out. The
8058                          * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8059                          */
8060                         ASSERT(HDR_HAS_L1HDR(hdr));
8061
8062                         ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8063                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8064                         ASSERT3U(arc_hdr_size(hdr), >, 0);
8065                         uint64_t psize = arc_hdr_size(hdr);
8066                         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
8067                             psize);
8068
8069                         if ((write_asize + asize) > target_sz) {
8070                                 full = B_TRUE;
8071                                 mutex_exit(hash_lock);
8072                                 ARCSTAT_BUMP(arcstat_l2_write_full);
8073                                 break;
8074                         }
8075
8076                         if (pio == NULL) {
8077                                 /*
8078                                  * Insert a dummy header on the buflist so
8079                                  * l2arc_write_done() can find where the
8080                                  * write buffers begin without searching.
8081                                  */
8082                                 mutex_enter(&dev->l2ad_mtx);
8083                                 list_insert_head(&dev->l2ad_buflist, head);
8084                                 mutex_exit(&dev->l2ad_mtx);
8085
8086                                 cb = kmem_alloc(
8087                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
8088                                 cb->l2wcb_dev = dev;
8089                                 cb->l2wcb_head = head;
8090                                 pio = zio_root(spa, l2arc_write_done, cb,
8091                                     ZIO_FLAG_CANFAIL);
8092                                 ARCSTAT_BUMP(arcstat_l2_write_pios);
8093                         }
8094
8095                         hdr->b_l2hdr.b_dev = dev;
8096                         hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
8097                         arc_hdr_set_flags(hdr,
8098                             ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
8099
8100                         mutex_enter(&dev->l2ad_mtx);
8101                         list_insert_head(&dev->l2ad_buflist, hdr);
8102                         mutex_exit(&dev->l2ad_mtx);
8103
8104                         (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr);
8105
8106                         /*
8107                          * Normally the L2ARC can use the hdr's data, but if
8108                          * we're sharing data between the hdr and one of its
8109                          * bufs, L2ARC needs its own copy of the data so that
8110                          * the ZIO below can't race with the buf consumer.
8111                          * Another case where we need to create a copy of the
8112                          * data is when the buffer size is not device-aligned
8113                          * and we need to pad the block to make it such.
8114                          * That also keeps the clock hand suitably aligned.
8115                          *
8116                          * To ensure that the copy will be available for the
8117                          * lifetime of the ZIO and be cleaned up afterwards, we
8118                          * add it to the l2arc_free_on_write queue.
8119                          */
8120                         abd_t *to_write;
8121                         if (!HDR_SHARED_DATA(hdr) && psize == asize) {
8122                                 to_write = hdr->b_l1hdr.b_pabd;
8123                         } else {
8124                                 to_write = abd_alloc_for_io(asize,
8125                                     HDR_ISTYPE_METADATA(hdr));
8126                                 abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
8127                                 if (asize != psize) {
8128                                         abd_zero_off(to_write, psize,
8129                                             asize - psize);
8130                                 }
8131                                 l2arc_free_abd_on_write(to_write, asize,
8132                                     arc_buf_type(hdr));
8133                         }
8134                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
8135                             hdr->b_l2hdr.b_daddr, asize, to_write,
8136                             ZIO_CHECKSUM_OFF, NULL, hdr,
8137                             ZIO_PRIORITY_ASYNC_WRITE,
8138                             ZIO_FLAG_CANFAIL, B_FALSE);
8139
8140                         write_lsize += HDR_GET_LSIZE(hdr);
8141                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
8142                             zio_t *, wzio);
8143
8144                         write_psize += psize;
8145                         write_asize += asize;
8146                         dev->l2ad_hand += asize;
8147
8148                         mutex_exit(hash_lock);
8149
8150                         (void) zio_nowait(wzio);
8151                 }
8152
8153                 multilist_sublist_unlock(mls);
8154
8155                 if (full == B_TRUE)
8156                         break;
8157         }
8158
8159         /* No buffers selected for writing? */
8160         if (pio == NULL) {
8161                 ASSERT0(write_lsize);
8162                 ASSERT(!HDR_HAS_L1HDR(head));
8163                 kmem_cache_free(hdr_l2only_cache, head);
8164                 return (0);
8165         }
8166
8167         ASSERT3U(write_psize, <=, target_sz);
8168         ARCSTAT_BUMP(arcstat_l2_writes_sent);
8169         ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
8170         ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
8171         ARCSTAT_INCR(arcstat_l2_psize, write_psize);
8172         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
8173
8174         /*
8175          * Bump device hand to the device start if it is approaching the end.
8176          * l2arc_evict() will already have evicted ahead for this case.
8177          */
8178         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
8179                 dev->l2ad_hand = dev->l2ad_start;
8180                 dev->l2ad_first = B_FALSE;
8181         }
8182
8183         dev->l2ad_writing = B_TRUE;
8184         (void) zio_wait(pio);
8185         dev->l2ad_writing = B_FALSE;
8186
8187         return (write_asize);
8188 }
8189
8190 /*
8191  * This thread feeds the L2ARC at regular intervals.  This is the beating
8192  * heart of the L2ARC.
8193  */
8194 /* ARGSUSED */
8195 static void
8196 l2arc_feed_thread(void *unused __unused)
8197 {
8198         callb_cpr_t cpr;
8199         l2arc_dev_t *dev;
8200         spa_t *spa;
8201         uint64_t size, wrote;
8202         clock_t begin, next = ddi_get_lbolt();
8203
8204         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
8205
8206         mutex_enter(&l2arc_feed_thr_lock);
8207
8208         while (l2arc_thread_exit == 0) {
8209                 CALLB_CPR_SAFE_BEGIN(&cpr);
8210                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
8211                     next - ddi_get_lbolt());
8212                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
8213                 next = ddi_get_lbolt() + hz;
8214
8215                 /*
8216                  * Quick check for L2ARC devices.
8217                  */
8218                 mutex_enter(&l2arc_dev_mtx);
8219                 if (l2arc_ndev == 0) {
8220                         mutex_exit(&l2arc_dev_mtx);
8221                         continue;
8222                 }
8223                 mutex_exit(&l2arc_dev_mtx);
8224                 begin = ddi_get_lbolt();
8225
8226                 /*
8227                  * This selects the next l2arc device to write to, and in
8228                  * doing so the next spa to feed from: dev->l2ad_spa.   This
8229                  * will return NULL if there are now no l2arc devices or if
8230                  * they are all faulted.
8231                  *
8232                  * If a device is returned, its spa's config lock is also
8233                  * held to prevent device removal.  l2arc_dev_get_next()
8234                  * will grab and release l2arc_dev_mtx.
8235                  */
8236                 if ((dev = l2arc_dev_get_next()) == NULL)
8237                         continue;
8238
8239                 spa = dev->l2ad_spa;
8240                 ASSERT3P(spa, !=, NULL);
8241
8242                 /*
8243                  * If the pool is read-only then force the feed thread to
8244                  * sleep a little longer.
8245                  */
8246                 if (!spa_writeable(spa)) {
8247                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
8248                         spa_config_exit(spa, SCL_L2ARC, dev);
8249                         continue;
8250                 }
8251
8252                 /*
8253                  * Avoid contributing to memory pressure.
8254                  */
8255                 if (arc_reclaim_needed()) {
8256                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
8257                         spa_config_exit(spa, SCL_L2ARC, dev);
8258                         continue;
8259                 }
8260
8261                 ARCSTAT_BUMP(arcstat_l2_feeds);
8262
8263                 size = l2arc_write_size();
8264
8265                 /*
8266                  * Evict L2ARC buffers that will be overwritten.
8267                  */
8268                 l2arc_evict(dev, size, B_FALSE);
8269
8270                 /*
8271                  * Write ARC buffers.
8272                  */
8273                 wrote = l2arc_write_buffers(spa, dev, size);
8274
8275                 /*
8276                  * Calculate interval between writes.
8277                  */
8278                 next = l2arc_write_interval(begin, size, wrote);
8279                 spa_config_exit(spa, SCL_L2ARC, dev);
8280         }
8281
8282         l2arc_thread_exit = 0;
8283         cv_broadcast(&l2arc_feed_thr_cv);
8284         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
8285         thread_exit();
8286 }
8287
8288 boolean_t
8289 l2arc_vdev_present(vdev_t *vd)
8290 {
8291         l2arc_dev_t *dev;
8292
8293         mutex_enter(&l2arc_dev_mtx);
8294         for (dev = list_head(l2arc_dev_list); dev != NULL;
8295             dev = list_next(l2arc_dev_list, dev)) {
8296                 if (dev->l2ad_vdev == vd)
8297                         break;
8298         }
8299         mutex_exit(&l2arc_dev_mtx);
8300
8301         return (dev != NULL);
8302 }
8303
8304 /*
8305  * Add a vdev for use by the L2ARC.  By this point the spa has already
8306  * validated the vdev and opened it.
8307  */
8308 void
8309 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
8310 {
8311         l2arc_dev_t *adddev;
8312
8313         ASSERT(!l2arc_vdev_present(vd));
8314
8315         vdev_ashift_optimize(vd);
8316
8317         /*
8318          * Create a new l2arc device entry.
8319          */
8320         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
8321         adddev->l2ad_spa = spa;
8322         adddev->l2ad_vdev = vd;
8323         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
8324         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
8325         adddev->l2ad_hand = adddev->l2ad_start;
8326         adddev->l2ad_first = B_TRUE;
8327         adddev->l2ad_writing = B_FALSE;
8328
8329         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
8330         /*
8331          * This is a list of all ARC buffers that are still valid on the
8332          * device.
8333          */
8334         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
8335             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
8336
8337         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
8338         refcount_create(&adddev->l2ad_alloc);
8339
8340         /*
8341          * Add device to global list
8342          */
8343         mutex_enter(&l2arc_dev_mtx);
8344         list_insert_head(l2arc_dev_list, adddev);
8345         atomic_inc_64(&l2arc_ndev);
8346         mutex_exit(&l2arc_dev_mtx);
8347 }
8348
8349 /*
8350  * Remove a vdev from the L2ARC.
8351  */
8352 void
8353 l2arc_remove_vdev(vdev_t *vd)
8354 {
8355         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
8356
8357         /*
8358          * Find the device by vdev
8359          */
8360         mutex_enter(&l2arc_dev_mtx);
8361         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
8362                 nextdev = list_next(l2arc_dev_list, dev);
8363                 if (vd == dev->l2ad_vdev) {
8364                         remdev = dev;
8365                         break;
8366                 }
8367         }
8368         ASSERT3P(remdev, !=, NULL);
8369
8370         /*
8371          * Remove device from global list
8372          */
8373         list_remove(l2arc_dev_list, remdev);
8374         l2arc_dev_last = NULL;          /* may have been invalidated */
8375         atomic_dec_64(&l2arc_ndev);
8376         mutex_exit(&l2arc_dev_mtx);
8377
8378         /*
8379          * Clear all buflists and ARC references.  L2ARC device flush.
8380          */
8381         l2arc_evict(remdev, 0, B_TRUE);
8382         list_destroy(&remdev->l2ad_buflist);
8383         mutex_destroy(&remdev->l2ad_mtx);
8384         refcount_destroy(&remdev->l2ad_alloc);
8385         kmem_free(remdev, sizeof (l2arc_dev_t));
8386 }
8387
8388 void
8389 l2arc_init(void)
8390 {
8391         l2arc_thread_exit = 0;
8392         l2arc_ndev = 0;
8393         l2arc_writes_sent = 0;
8394         l2arc_writes_done = 0;
8395
8396         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
8397         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
8398         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
8399         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
8400
8401         l2arc_dev_list = &L2ARC_dev_list;
8402         l2arc_free_on_write = &L2ARC_free_on_write;
8403         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
8404             offsetof(l2arc_dev_t, l2ad_node));
8405         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
8406             offsetof(l2arc_data_free_t, l2df_list_node));
8407 }
8408
8409 void
8410 l2arc_fini(void)
8411 {
8412         /*
8413          * This is called from dmu_fini(), which is called from spa_fini();
8414          * Because of this, we can assume that all l2arc devices have
8415          * already been removed when the pools themselves were removed.
8416          */
8417
8418         l2arc_do_free_on_write();
8419
8420         mutex_destroy(&l2arc_feed_thr_lock);
8421         cv_destroy(&l2arc_feed_thr_cv);
8422         mutex_destroy(&l2arc_dev_mtx);
8423         mutex_destroy(&l2arc_free_on_write_mtx);
8424
8425         list_destroy(l2arc_dev_list);
8426         list_destroy(l2arc_free_on_write);
8427 }
8428
8429 void
8430 l2arc_start(void)
8431 {
8432         if (!(spa_mode_global & FWRITE))
8433                 return;
8434
8435         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
8436             TS_RUN, minclsyspri);
8437 }
8438
8439 void
8440 l2arc_stop(void)
8441 {
8442         if (!(spa_mode_global & FWRITE))
8443                 return;
8444
8445         mutex_enter(&l2arc_feed_thr_lock);
8446         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
8447         l2arc_thread_exit = 1;
8448         while (l2arc_thread_exit != 0)
8449                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
8450         mutex_exit(&l2arc_feed_thr_lock);
8451 }