module/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  */
  27
  28 /*
  29  * DVA-based Adjustable Replacement Cache
  30  *
  31  * While much of the theory of operation used here is
  32  * based on the self-tuning, low overhead replacement cache
  33  * presented by Megiddo and Modha at FAST 2003, there are some
  34  * significant differences:
  35  *
  36  * 1. The Megiddo and Modha model assumes any page is evictable.
  37  * Pages in its cache cannot be "locked" into memory.  This makes
  38  * the eviction algorithm simple: evict the last page in the list.
  39  * This also makes the performance characteristics easy to reason
  40  * about.  Our cache is not so simple.  At any given moment, some
  41  * subset of the blocks in the cache are un-evictable because we
  42  * have handed out a reference to them.  Blocks are only evictable
  43  * when there are no external references active.  This makes
  44  * eviction far more problematic:  we choose to evict the evictable
  45  * blocks that are the "lowest" in the list.
  46  *
  47  * There are times when it is not possible to evict the requested
  48  * space.  In these circumstances we are unable to adjust the cache
  49  * size.  To prevent the cache growing unbounded at these times we
  50  * implement a "cache throttle" that slows the flow of new data
  51  * into the cache until we can make space available.
  52  *
  53  * 2. The Megiddo and Modha model assumes a fixed cache size.
  54  * Pages are evicted when the cache is full and there is a cache
  55  * miss.  Our model has a variable sized cache.  It grows with
  56  * high use, but also tries to react to memory pressure from the
  57  * operating system: decreasing its size when system memory is
  58  * tight.
  59  *
  60  * 3. The Megiddo and Modha model assumes a fixed page size. All
  61  * elements of the cache are therefore exactly the same size.  So
  62  * when adjusting the cache size following a cache miss, it's simply
  63  * a matter of choosing a single page to evict.  In our model, we
  64  * have variable sized cache blocks (ranging from 512 bytes to
  65  * 128K bytes).  We therefore choose a set of blocks to evict to make
  66  * space for a cache miss that approximates as closely as possible
  67  * the space used by the new block.
  68  *
  69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  70  * by N. Megiddo & D. Modha, FAST 2003
  71  */
  72
  73 /*
  74  * The locking model:
  75  *
  76  * A new reference to a cache buffer can be obtained in two
  77  * ways: 1) via a hash table lookup using the DVA as a key,
  78  * or 2) via one of the ARC lists.  The arc_read() interface
  79  * uses method 1, while the internal arc algorithms for
  80  * adjusting the cache use method 2.  We therefore provide two
  81  * types of locks: 1) the hash table lock array, and 2) the
  82  * arc list locks.
  83  *
  84  * Buffers do not have their own mutexes, rather they rely on the
  85  * hash table mutexes for the bulk of their protection (i.e. most
  86  * fields in the arc_buf_hdr_t are protected by these mutexes).
  87  *
  88  * buf_hash_find() returns the appropriate mutex (held) when it
  89  * locates the requested buffer in the hash table.  It returns
  90  * NULL for the mutex if the buffer was not in the table.
  91  *
  92  * buf_hash_remove() expects the appropriate hash mutex to be
  93  * already held before it is invoked.
  94  *
  95  * Each arc state also has a mutex which is used to protect the
  96  * buffer list associated with the state.  When attempting to
  97  * obtain a hash table lock while holding an arc list lock you
  98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  99  * the active state mutex must be held before the ghost state mutex.
 100  *
 101  * Arc buffers may have an associated eviction callback function.
 102  * This function will be invoked prior to removing the buffer (e.g.
 103  * in arc_do_user_evicts()).  Note however that the data associated
 104  * with the buffer may be evicted prior to the callback.  The callback
 105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 106  * the users of callbacks must ensure that their private data is
 107  * protected from simultaneous callbacks from arc_buf_evict()
 108  * and arc_do_user_evicts().
 109  *
 110  * It is also possible to register a callback which is run when the
 111  * arc_meta_limit is reached and no buffers can be safely evicted.  In
 112  * this case the arc user should drop a reference on some arc buffers so
 113  * they can be reclaimed and the arc_meta_limit honored.  For example,
 114  * when using the ZPL each dentry holds a reference on a znode.  These
 115  * dentries must be pruned before the arc buffer holding the znode can
 116  * be safely evicted.
 117  *
 118  * Note that the majority of the performance stats are manipulated
 119  * with atomic operations.
 120  *
 121  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 122  *
 123  *      - L2ARC buflist creation
 124  *      - L2ARC buflist eviction
 125  *      - L2ARC write completion, which walks L2ARC buflists
 126  *      - ARC header destruction, as it removes from L2ARC buflists
 127  *      - ARC header release, as it removes from L2ARC buflists
 128  */
 129
 130 #include <sys/spa.h>
 131 #include <sys/zio.h>
 132 #include <sys/zio_compress.h>
 133 #include <sys/zfs_context.h>
 134 #include <sys/arc.h>
 135 #include <sys/vdev.h>
 136 #include <sys/vdev_impl.h>
 137 #include <sys/dsl_pool.h>
 138 #ifdef _KERNEL
 139 #include <sys/vmsystm.h>
 140 #include <vm/anon.h>
 141 #include <sys/fs/swapnode.h>
 142 #include <sys/zpl.h>
 143 #endif
 144 #include <sys/callb.h>
 145 #include <sys/kstat.h>
 146 #include <sys/dmu_tx.h>
 147 #include <zfs_fletcher.h>
 148
 149 #ifndef _KERNEL
 150 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 151 boolean_t arc_watch = B_FALSE;  /* off by default; only defined when _KERNEL is not defined */
 152 #endif
 153
 154 static kmutex_t         arc_reclaim_thr_lock;   /* NOTE(review): presumably pairs with arc_reclaim_thr_cv -- confirm */
 155 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal the reclaim thread */
 156 static uint8_t          arc_thread_exit;        /* NOTE(review): looks like a thread-exit request flag -- confirm */
 157
 158 /* number of bytes to prune from caches when arc_meta_limit is reached */
 159 int zfs_arc_meta_prune = 1048576;       /* default: 1048576 bytes (1 MiB) */
 160
 161 typedef enum arc_reclaim_strategy {     /* the two ARC reclaim strategies */
 162         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy (enumerator value 0) */
 163         ARC_RECLAIM_CONS                /* Conservative reclaim strategy (enumerator value 1) */
 164 } arc_reclaim_strategy_t;
 165
 166 /*
 167  * The number of iterations through arc_evict_*() before we
 168  * drop & reacquire the lock.
 169  */
 170 int arc_evict_iterations = 100;
 171
 172 /* number of seconds before growing cache again */
 173 int zfs_arc_grow_retry = 5;
 174
 175 /* disable anon data aggressively growing arc_p (switch is set by default) */
 176 int zfs_arc_p_aggressive_disable = 1;
 177
 178 /* disable arc_p adapt dampener in arc_adapt (switch is set by default) */
 179 int zfs_arc_p_dampener_disable = 1;
 180
 181 /* log2(fraction of arc to reclaim) */
 182 int zfs_arc_shrink_shift = 5;   /* NOTE(review): 5 presumably means 1/32 of the arc -- confirm at use site */
 183
 184 /*
 185  * minimum lifespan of a prefetch block in clock ticks
 186  * (initialized in arc_init())
 187  */
 188 int zfs_arc_min_prefetch_lifespan = HZ; /* static default is HZ ticks */
 189
 190 /* disable the proactive arc throttle due to low memory (switch is set by default) */
 191 int zfs_arc_memory_throttle_disable = 1;
 192
 193 /* disable duplicate buffer eviction (switch is clear by default) */
 194 int zfs_disable_dup_eviction = 0;
 195
 196 /*
 197  * If this percent of memory is free, don't throttle.
 198  */
 199 int arc_lotsfree_percent = 10;
 200
 201 static int arc_dead;    /* NOTE(review): no use visible in this section; zero at start as a static -- confirm meaning */
 202
 203 /* expiration time for arc_no_grow */
 204 static clock_t arc_grow_time = 0;
 205
 206 /*
 207  * The arc has filled available memory and has now warmed up.
 208  */
 209 static boolean_t arc_warm;      /* not explicitly initialized, so zero at start */
 210
 211 /*
 212  * These tunables are for performance analysis.
 213  * All three default to 0.
 214  */
 214 unsigned long zfs_arc_max = 0;
 215 unsigned long zfs_arc_min = 0;
 216 unsigned long zfs_arc_meta_limit = 0;
 216 unsigned long zfs_arc_meta_limit = 0;
 217
 218 /*
 219  * Note that buffers can be in one of 6 states:
 220  *      ARC_anon        - anonymous (discussed below)
 221  *      ARC_mru         - recently used, currently cached
 222  *      ARC_mru_ghost   - recently used, no longer in cache
 223  *      ARC_mfu         - frequently used, currently cached
 224  *      ARC_mfu_ghost   - frequently used, no longer in cache
 225  *      ARC_l2c_only    - exists in L2ARC but not other states
 226  * When there are no active references to the buffer, they are
 227  * linked onto a list in one of these arc states.  These are
 228  * the only buffers that can be evicted or deleted.  Within each
 229  * state there are multiple lists, one for meta-data and one for
 230  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 231  * etc.) is tracked separately so that it can be managed more
 232  * explicitly: favored over data, limited explicitly.
 233  *
 234  * Anonymous buffers are buffers that are not associated with
 235  * a DVA.  These are buffers that hold dirty block copies
 236  * before they are written to stable storage.  By definition,
 237  * they are "ref'd" and are considered part of arc_mru
 238  * that cannot be freed.  Generally, they will acquire a DVA
 239  * as they are written and migrate onto the arc_mru list.
 240  *
 241  * The ARC_l2c_only state is for buffers that are in the second
 242  * level ARC but no longer in any of the ARC_m* lists.  The second
 243  * level ARC itself may also contain buffers that are in any of
 244  * the ARC_m* states - meaning that a buffer can exist in two
 245  * places.  The reason for the ARC_l2c_only state is to keep the
 246  * buffer header in the hash table, so that reads that hit the
 247  * second level ARC benefit from these fast lookups.
 248  */
 249
 250 typedef struct arc_state {      /* per-state lists and sizes; arrays are indexed by buffer contents type */
 251         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 252         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 253         uint64_t arcs_size;     /* total amount of data in this state */
 254         kmutex_t arcs_mtx;      /* protects the buffer list associated with this state */
 255         arc_state_type_t arcs_state;    /* NOTE(review): presumably identifies which state this is -- confirm */
 256 } arc_state_t;
 257
 258 /* The 6 states (storage for each arc_state_t; file-local): */
 259 static arc_state_t ARC_anon;
 260 static arc_state_t ARC_mru;
 261 static arc_state_t ARC_mru_ghost;
 262 static arc_state_t ARC_mfu;
 263 static arc_state_t ARC_mfu_ghost;
 264 static arc_state_t ARC_l2c_only;
 265
 266 typedef struct arc_stats {      /* one kstat_named_t per ARC statistic; member order matters to the positional arc_stats initializer */
 267         kstat_named_t arcstat_hits;
 268         kstat_named_t arcstat_misses;
 269         kstat_named_t arcstat_demand_data_hits;
 270         kstat_named_t arcstat_demand_data_misses;
 271         kstat_named_t arcstat_demand_metadata_hits;
 272         kstat_named_t arcstat_demand_metadata_misses;
 273         kstat_named_t arcstat_prefetch_data_hits;
 274         kstat_named_t arcstat_prefetch_data_misses;
 275         kstat_named_t arcstat_prefetch_metadata_hits;
 276         kstat_named_t arcstat_prefetch_metadata_misses;
 277         kstat_named_t arcstat_mru_hits;
 278         kstat_named_t arcstat_mru_ghost_hits;
 279         kstat_named_t arcstat_mfu_hits;
 280         kstat_named_t arcstat_mfu_ghost_hits;
 281         kstat_named_t arcstat_deleted;
 282         kstat_named_t arcstat_recycle_miss;
 283         /*
 284          * Number of buffers that could not be evicted because the hash lock
 285          * was held by another thread.  The lock may not necessarily be held
 286          * by something using the same buffer, since hash locks are shared
 287          * by multiple buffers.
 288          */
 289         kstat_named_t arcstat_mutex_miss;
 290         /*
 291          * Number of buffers skipped because they have I/O in progress, are
 292          * indirect prefetch buffers that have not lived long enough, or are
 293          * not from the spa we're trying to evict from.
 294          */
 295         kstat_named_t arcstat_evict_skip;
 296         kstat_named_t arcstat_evict_l2_cached;
 297         kstat_named_t arcstat_evict_l2_eligible;
 298         kstat_named_t arcstat_evict_l2_ineligible;
 299         kstat_named_t arcstat_hash_elements;
 300         kstat_named_t arcstat_hash_elements_max;
 301         kstat_named_t arcstat_hash_collisions;
 302         kstat_named_t arcstat_hash_chains;
 303         kstat_named_t arcstat_hash_chain_max;
 304         kstat_named_t arcstat_p;
 305         kstat_named_t arcstat_c;
 306         kstat_named_t arcstat_c_min;
 307         kstat_named_t arcstat_c_max;
 308         kstat_named_t arcstat_size;
 309         kstat_named_t arcstat_hdr_size;
 310         kstat_named_t arcstat_data_size;
 311         kstat_named_t arcstat_meta_size;
 312         kstat_named_t arcstat_other_size;
 313         kstat_named_t arcstat_anon_size;
 314         kstat_named_t arcstat_anon_evict_data;
 315         kstat_named_t arcstat_anon_evict_metadata;
 316         kstat_named_t arcstat_mru_size;
 317         kstat_named_t arcstat_mru_evict_data;
 318         kstat_named_t arcstat_mru_evict_metadata;
 319         kstat_named_t arcstat_mru_ghost_size;
 320         kstat_named_t arcstat_mru_ghost_evict_data;
 321         kstat_named_t arcstat_mru_ghost_evict_metadata;
 322         kstat_named_t arcstat_mfu_size;
 323         kstat_named_t arcstat_mfu_evict_data;
 324         kstat_named_t arcstat_mfu_evict_metadata;
 325         kstat_named_t arcstat_mfu_ghost_size;
 326         kstat_named_t arcstat_mfu_ghost_evict_data;
 327         kstat_named_t arcstat_mfu_ghost_evict_metadata;
 328         kstat_named_t arcstat_l2_hits;
 329         kstat_named_t arcstat_l2_misses;
 330         kstat_named_t arcstat_l2_feeds;
 331         kstat_named_t arcstat_l2_rw_clash;
 332         kstat_named_t arcstat_l2_read_bytes;
 333         kstat_named_t arcstat_l2_write_bytes;
 334         kstat_named_t arcstat_l2_writes_sent;
 335         kstat_named_t arcstat_l2_writes_done;
 336         kstat_named_t arcstat_l2_writes_error;
 337         kstat_named_t arcstat_l2_writes_hdr_miss;
 338         kstat_named_t arcstat_l2_evict_lock_retry;
 339         kstat_named_t arcstat_l2_evict_reading;
 340         kstat_named_t arcstat_l2_free_on_write;
 341         kstat_named_t arcstat_l2_abort_lowmem;
 342         kstat_named_t arcstat_l2_cksum_bad;
 343         kstat_named_t arcstat_l2_io_error;
 344         kstat_named_t arcstat_l2_size;
 345         kstat_named_t arcstat_l2_asize;
 346         kstat_named_t arcstat_l2_hdr_size;
 347         kstat_named_t arcstat_l2_compress_successes;
 348         kstat_named_t arcstat_l2_compress_zeros;
 349         kstat_named_t arcstat_l2_compress_failures;
 350         kstat_named_t arcstat_memory_throttle_count;
 351         kstat_named_t arcstat_duplicate_buffers;
 352         kstat_named_t arcstat_duplicate_buffers_size;
 353         kstat_named_t arcstat_duplicate_reads;
 354         kstat_named_t arcstat_memory_direct_count;
 355         kstat_named_t arcstat_memory_indirect_count;
 356         kstat_named_t arcstat_no_grow;
 357         kstat_named_t arcstat_tempreserve;
 358         kstat_named_t arcstat_loaned_bytes;
 359         kstat_named_t arcstat_prune;
 360         kstat_named_t arcstat_meta_used;
 361         kstat_named_t arcstat_meta_limit;
 362         kstat_named_t arcstat_meta_max;
 363 } arc_stats_t;
 364
 365 static arc_stats_t arc_stats = {        /* positional: entries must stay in arc_stats_t member order */
 366         { "hits",                       KSTAT_DATA_UINT64 },
 367         { "misses",                     KSTAT_DATA_UINT64 },
 368         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 369         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 370         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 371         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 372         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 373         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 374         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 375         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 376         { "mru_hits",                   KSTAT_DATA_UINT64 },
 377         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 378         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 379         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 380         { "deleted",                    KSTAT_DATA_UINT64 },
 381         { "recycle_miss",               KSTAT_DATA_UINT64 },
 382         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 383         { "evict_skip",                 KSTAT_DATA_UINT64 },
 384         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 385         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 386         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 387         { "hash_elements",              KSTAT_DATA_UINT64 },
 388         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 389         { "hash_collisions",            KSTAT_DATA_UINT64 },
 390         { "hash_chains",                KSTAT_DATA_UINT64 },
 391         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 392         { "p",                          KSTAT_DATA_UINT64 },
 393         { "c",                          KSTAT_DATA_UINT64 },
 394         { "c_min",                      KSTAT_DATA_UINT64 },
 395         { "c_max",                      KSTAT_DATA_UINT64 },
 396         { "size",                       KSTAT_DATA_UINT64 },
 397         { "hdr_size",                   KSTAT_DATA_UINT64 },
 398         { "data_size",                  KSTAT_DATA_UINT64 },
 399         { "meta_size",                  KSTAT_DATA_UINT64 },
 400         { "other_size",                 KSTAT_DATA_UINT64 },
 401         { "anon_size",                  KSTAT_DATA_UINT64 },
 402         { "anon_evict_data",            KSTAT_DATA_UINT64 },
 403         { "anon_evict_metadata",        KSTAT_DATA_UINT64 },
 404         { "mru_size",                   KSTAT_DATA_UINT64 },
 405         { "mru_evict_data",             KSTAT_DATA_UINT64 },
 406         { "mru_evict_metadata",         KSTAT_DATA_UINT64 },
 407         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 408         { "mru_ghost_evict_data",       KSTAT_DATA_UINT64 },
 409         { "mru_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 410         { "mfu_size",                   KSTAT_DATA_UINT64 },
 411         { "mfu_evict_data",             KSTAT_DATA_UINT64 },
 412         { "mfu_evict_metadata",         KSTAT_DATA_UINT64 },
 413         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 414         { "mfu_ghost_evict_data",       KSTAT_DATA_UINT64 },
 415         { "mfu_ghost_evict_metadata",   KSTAT_DATA_UINT64 },
 416         { "l2_hits",                    KSTAT_DATA_UINT64 },
 417         { "l2_misses",                  KSTAT_DATA_UINT64 },
 418         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 419         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 420         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 421         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 422         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 423         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 424         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 425         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 426         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 427         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 428         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 429         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 430         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 431         { "l2_io_error",                KSTAT_DATA_UINT64 },
 432         { "l2_size",                    KSTAT_DATA_UINT64 },
 433         { "l2_asize",                   KSTAT_DATA_UINT64 },
 434         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 435         { "l2_compress_successes",      KSTAT_DATA_UINT64 },
 436         { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
 437         { "l2_compress_failures",       KSTAT_DATA_UINT64 },
 438         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 439         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 440         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 441         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 442         { "memory_direct_count",        KSTAT_DATA_UINT64 },
 443         { "memory_indirect_count",      KSTAT_DATA_UINT64 },
 444         { "arc_no_grow",                KSTAT_DATA_UINT64 },
 445         { "arc_tempreserve",            KSTAT_DATA_UINT64 },
 446         { "arc_loaned_bytes",           KSTAT_DATA_UINT64 },
 447         { "arc_prune",                  KSTAT_DATA_UINT64 },
 448         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 449         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 450         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 451 };
 452
 453 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)     /* the ui64 value of arc_stats.<stat>, usable as an lvalue */
 454 /* ARCSTAT_INCR: add (val) to arc_stats.<stat> via atomic_add_64() */
 455 #define ARCSTAT_INCR(stat, val) \
 456         atomic_add_64(&arc_stats.stat.value.ui64, (val))
 457
 458 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)   /* add 1 */
 459 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)  /* add -1 */
 460
 461 #define ARCSTAT_MAX(stat, val) {                                        \
 462         uint64_t m;                                                     \
 463         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 464             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 465                 continue;                                               \
 466 }
 467
 468 #define ARCSTAT_MAXSTAT(stat) \
 469         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 470
 471 /*
 472  * We define a macro to allow ARC hits/misses to be easily broken down by
 473  * two separate conditions, giving a total of four different subtypes for
 474  * each of hits and misses (so eight statistics total).
 475  */
 476 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 477         if (cond1) {                                                    \
 478                 if (cond2) {                                            \
 479                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 480                 } else {                                                \
 481                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 482                 }                                                       \
 483         } else {                                                        \
 484                 if (cond2) {                                            \
 485                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 486                 } else {                                                \
 487                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 488                 }                                                       \
 489         }
 490
 491 kstat_t                 *arc_ksp;       /* non-static; NOTE(review): presumably the kstat handle for arc_stats -- confirm */
 492 static arc_state_t      *arc_anon;      /* state pointers; not assigned in this section -- presumably set to the ARC_* objects at init */
 493 static arc_state_t      *arc_mru;
 494 static arc_state_t      *arc_mru_ghost;
 495 static arc_state_t      *arc_mfu;
 496 static arc_state_t      *arc_mfu_ghost;
 497 static arc_state_t      *arc_l2c_only;
 498
 499 /*
 500  * There are several ARC variables that are critical to export as kstats --
 501  * but we don't want to have to grovel around in the kstat whenever we wish to
 502  * manipulate them.  For these variables, we therefore define them to be in
 503  * terms of the statistic variable.  This assures that we are not introducing
 504  * the possibility of inconsistency by having shadow copies of the variables,
 505  * while still allowing the code to be readable.
 506  */
 507 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 508 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 509 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 510 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 511 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 512 #define arc_no_grow     ARCSTAT(arcstat_no_grow)        /* exported as "arc_no_grow" */
 513 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)    /* exported as "arc_tempreserve" */
 514 #define arc_loaned_bytes        ARCSTAT(arcstat_loaned_bytes)   /* exported as "arc_loaned_bytes" */
 515 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 516 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 517 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 518
 519 #define L2ARC_IS_VALID_COMPRESS(_c_) \
 520         ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)      /* true only for LZ4 or EMPTY; may evaluate _c_ twice */
 521
 522 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;   /* forward declaration; the struct body is not in this section */
 523
 524 typedef struct arc_callback arc_callback_t;
 525
 526 struct arc_callback {
 527         void                    *acb_private;   /* NOTE(review): presumably the opaque argument for acb_done -- confirm */
 528         arc_done_func_t         *acb_done;      /* pointer to an arc_done_func_t */
 529         arc_buf_t               *acb_buf;
 530         zio_t                   *acb_zio_dummy;
 531         arc_callback_t          *acb_next;      /* link to another arc_callback_t (singly linked) */
 532 };
 533
 534 typedef struct arc_write_callback arc_write_callback_t;
 535
 536 struct arc_write_callback {
 537         void            *awcb_private;  /* NOTE(review): presumably the opaque argument for the functions below -- confirm */
 538         arc_done_func_t *awcb_ready;    /* three arc_done_func_t pointers: ready, physdone, done */
 539         arc_done_func_t *awcb_physdone;
 540         arc_done_func_t *awcb_done;
 541         arc_buf_t       *awcb_buf;
 542 };
 543
 544 struct arc_buf_hdr {    /* ARC buffer header; fields are grouped below by what protects them */
 545         /* protected by hash lock */
 546         dva_t                   b_dva;
 547         uint64_t                b_birth;
 548         uint64_t                b_cksum0;
 549
 550         kmutex_t                b_freeze_lock;
 551         zio_cksum_t             *b_freeze_cksum;
 552
 553         arc_buf_hdr_t           *b_hash_next;   /* NOTE(review): presumably the next header on the same hash chain -- confirm */
 554         arc_buf_t               *b_buf;
 555         uint32_t                b_flags;        /* holds the ARC_* flag bits tested by the HDR_*() macros */
 556         uint32_t                b_datacnt;
 557
 558         arc_callback_t          *b_acb;
 559         kcondvar_t              b_cv;
 560
 561         /* immutable */
 562         arc_buf_contents_t      b_type;
 563         uint64_t                b_size;
 564         uint64_t                b_spa;
 565
 566         /* protected by arc state mutex */
 567         arc_state_t             *b_state;
 568         list_node_t             b_arc_node;
 569
 570         /* updated atomically */
 571         clock_t                 b_arc_access;
 572         uint32_t                b_mru_hits;
 573         uint32_t                b_mru_ghost_hits;
 574         uint32_t                b_mfu_hits;
 575         uint32_t                b_mfu_ghost_hits;
 576         uint32_t                b_l2_hits;
 577
 578         /* self protecting */
 579         refcount_t              b_refcnt;
 580
 581         l2arc_buf_hdr_t         *b_l2hdr;       /* tested against NULL by HDR_L2_READING() */
 582         list_node_t             b_l2node;
 583 };
 584
 585 static list_t arc_prune_list;
 586 static kmutex_t arc_prune_mtx;  /* NOTE(review): presumably protects arc_prune_list -- confirm */
 587 static arc_buf_t *arc_eviction_list;
 588 static kmutex_t arc_eviction_mtx;       /* NOTE(review): presumably protects arc_eviction_list -- confirm */
 589 static arc_buf_hdr_t arc_eviction_hdr;
 590 static void arc_get_data_buf(arc_buf_t *buf);   /* prototypes of file-local (static) functions follow */
 591 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 592 static int arc_evict_needed(arc_buf_contents_t type);
 593 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
 594     arc_buf_contents_t type);
 595 static void arc_buf_watch(arc_buf_t *buf);
 596
 597 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 598
 599 #define GHOST_STATE(state)      \
 600         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 601         (state) == arc_l2c_only)        /* true for mru_ghost, mfu_ghost or l2c_only; may evaluate (state) up to three times */
 602
 603 /*
 604  * Private ARC flags.  These flags are private ARC only flags that will show up
 605  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 606  * be passed in as arc_flags in things like arc_read.  However, these flags
 607  * should never be passed and should only be set by ARC code.  When adding new
 608  * public flags, make sure not to smash the private ones.
 609  */
 610
 611 #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 612 #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 613 #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 614 #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 615 #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 616 #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 617 #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 618 #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 619 #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 620 #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 621 /* HDR_*(): test one flag in (hdr)->b_flags; the result is the masked bit (zero or nonzero), not 0/1 */
 622 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 623 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 624 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 625 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH) /* ARC_PREFETCH is not defined in this section */
 626 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 627 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 628 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 629 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)  /* ARC_L2CACHE is not defined in this section */
 630 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 631                                     (hdr)->b_l2hdr != NULL)     /* I/O in progress and b_l2hdr is set */
 632 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 633 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 634 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 635
 636 /*
 637  * Other sizes: the sizes of the two header structures, cast to the
 638  * signed type int64_t.
 638  */
 639
 640 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 641 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 642
 643 /*
 644  * Hash table routines
 645  */
 646
 647 #define HT_LOCK_ALIGN   64
 648 #define HT_LOCK_PAD     (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))  /* NOTE(review): presumably the bytes needed to round a kmutex_t up to HT_LOCK_ALIGN -- confirm P2NPHASE */
 649
 650 struct ht_lock {
 651         kmutex_t        ht_lock;
 652 #ifdef _KERNEL
 653         unsigned char   pad[HT_LOCK_PAD];       /* padding is only present in _KERNEL builds */
 654 #endif
 655 };
 656
 657 #define BUF_LOCKS 256   /* number of hash locks; BUF_HASH_LOCK_NTRY masks with BUF_LOCKS-1, so keep this a power of two */
 658 typedef struct buf_hash_table {
 659         uint64_t ht_mask;       /* ANDed with buf_hash() by BUF_HASH_INDEX to form the table index */
 660         arc_buf_hdr_t **ht_table;
 661         struct ht_lock ht_locks[BUF_LOCKS];
 662 } buf_hash_table_t;
 663
 664 static buf_hash_table_t buf_hash_table; /* the single file-local table instance used by the BUF_HASH_*() macros */
 665
 666 #define BUF_HASH_INDEX(spa, dva, birth) \
 667         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 668 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 669 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 670 #define HDR_LOCK(hdr) \
 671         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 672
/* CRC-64 lookup table: filled in by buf_init(), read by buf_hash() */
uint64_t zfs_crc64_table[256];
 674
 675 /*
 676  * Level 2 ARC
 677  */
 678
/* Compile-time defaults for the L2ARC tunables declared further down. */
#define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
#define L2ARC_HEADROOM          2                       /* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST    200
#define L2ARC_FEED_SECS         1               /* caching interval secs */
#define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */

/* shorthand for the L2ARC write counters kept in the ARC statistics */
#define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)

/*
 * L2ARC Performance Tunables
 *
 * Non-static globals, each initialized from the matching L2ARC_* default
 * (or a boolean constant).
 */
unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;       /* def max write size */
unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;     /* extra warmup write */
unsigned long l2arc_headroom = L2ARC_HEADROOM;          /* # of dev writes */
unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; /* boost, percent */
unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;        /* interval seconds */
unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;    /* min interval msecs */
int l2arc_noprefetch = B_TRUE;                  /* don't cache prefetch bufs */
int l2arc_nocompress = B_FALSE;                 /* don't compress bufs */
int l2arc_feed_again = B_TRUE;                  /* turbo warmup */
int l2arc_norw = B_FALSE;                       /* no reads during writes */
 703
 704 /*
 705  * L2ARC Internals
 706  */
/*
 * Per-device L2ARC state: the backing vdev and its spa, the write hand
 * and address bounds on the device, eviction progress, sweep/writing
 * flags, the device's buffer list, and the linkage (l2ad_node) used to
 * place the device on the device list.
 */
typedef struct l2arc_dev {
        vdev_t                  *l2ad_vdev;     /* vdev */
        spa_t                   *l2ad_spa;      /* spa */
        uint64_t                l2ad_hand;      /* next write location */
        uint64_t                l2ad_start;     /* first addr on device */
        uint64_t                l2ad_end;       /* last addr on device */
        uint64_t                l2ad_evict;     /* last addr eviction reached */
        boolean_t               l2ad_first;     /* first sweep through */
        boolean_t               l2ad_writing;   /* currently writing */
        list_t                  *l2ad_buflist;  /* buffer list */
        list_node_t             l2ad_node;      /* device list node */
} l2arc_dev_t;
 719
/*
 * File-local L2ARC bookkeeping.
 * NOTE(review): each list appears as a list_t object plus a list_t
 * pointer; presumably the pointer is aimed at the object during
 * initialization, which is outside this part of the file -- confirm.
 */
static list_t L2ARC_dev_list;                   /* device list */
static list_t *l2arc_dev_list;                  /* device list pointer */
static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
static l2arc_dev_t *l2arc_dev_last;             /* last device used */
static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
static list_t L2ARC_free_on_write;              /* free after write buf list */
static list_t *l2arc_free_on_write;             /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
static uint64_t l2arc_ndev;                     /* number of devices */
 729
/*
 * Context saved for an L2ARC read: the buffer being read into plus the
 * original spa, block pointer, bookmark and flags of the request, and
 * the compression that was applied to the data.
 */
typedef struct l2arc_read_callback {
        arc_buf_t               *l2rcb_buf;             /* read buffer */
        spa_t                   *l2rcb_spa;             /* spa */
        blkptr_t                l2rcb_bp;               /* original blkptr */
        zbookmark_t             l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
        enum zio_compress       l2rcb_compress;         /* applied compress */
} l2arc_read_callback_t;
 738
/*
 * Context saved for an L2ARC write: the target device and the header
 * that marks the head of the write buffer list.
 */
typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
} l2arc_write_callback_t;
 743
/*
 * L2ARC-specific portion of a buffer header (reached through the
 * arc_buf_hdr's b_l2hdr pointer).
 */
struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
        /* compression applied to buffer data */
        enum zio_compress       b_compress;
        /* hit counter; exported by arc_buf_info() as abi_l2arc_hits */
        uint32_t                b_hits;
        /* real alloc'd buffer size depending on b_compress applied */
        uint64_t                b_asize;
        /* temporary buffer holder for in-flight compressed data */
        void                    *b_tmp_cdata;
};
 756
/*
 * One deferred-free record: a data pointer, its size, and the function
 * to release it with, linked onto a list through l2df_list_node.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 764
/*
 * Lock, condition variable and exit flag for the L2ARC feed thread.
 * NOTE(review): the thread itself lives outside this part of the file;
 * the exact handshake should be confirmed there.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

/* Forward declarations for L2ARC routines defined later in the file. */
static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);

static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
    enum zio_compress c);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
 777
 778 static uint64_t
 779 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 780 {
 781         uint8_t *vdva = (uint8_t *)dva;
 782         uint64_t crc = -1ULL;
 783         int i;
 784
 785         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 786
 787         for (i = 0; i < sizeof (dva_t); i++)
 788                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 789
 790         crc ^= (spa>>8) ^ birth;
 791
 792         return (crc);
 793 }
 794
 795 #define BUF_EMPTY(buf)                                          \
 796         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 797         (buf)->b_dva.dva_word[1] == 0 &&                        \
 798         (buf)->b_birth == 0)
 799
 800 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 801         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 802         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 803         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 804
 805 static void
 806 buf_discard_identity(arc_buf_hdr_t *hdr)
 807 {
 808         hdr->b_dva.dva_word[0] = 0;
 809         hdr->b_dva.dva_word[1] = 0;
 810         hdr->b_birth = 0;
 811         hdr->b_cksum0 = 0;
 812 }
 813
 814 static arc_buf_hdr_t *
 815 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 816 {
 817         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 818         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 819         arc_buf_hdr_t *buf;
 820
 821         mutex_enter(hash_lock);
 822         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 823             buf = buf->b_hash_next) {
 824                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 825                         *lockp = hash_lock;
 826                         return (buf);
 827                 }
 828         }
 829         mutex_exit(hash_lock);
 830         *lockp = NULL;
 831         return (NULL);
 832 }
 833
 834 /*
 835  * Insert an entry into the hash table.  If there is already an element
 836  * equal to elem in the hash table, then the already existing element
 837  * will be returned and the new element will not be inserted.
 838  * Otherwise returns NULL.
 839  */
 840 static arc_buf_hdr_t *
 841 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 842 {
 843         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 844         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 845         arc_buf_hdr_t *fbuf;
 846         uint32_t i;
 847
 848         ASSERT(!HDR_IN_HASH_TABLE(buf));
 849         *lockp = hash_lock;
 850         mutex_enter(hash_lock);
 851         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 852             fbuf = fbuf->b_hash_next, i++) {
 853                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 854                         return (fbuf);
 855         }
 856
 857         buf->b_hash_next = buf_hash_table.ht_table[idx];
 858         buf_hash_table.ht_table[idx] = buf;
 859         buf->b_flags |= ARC_IN_HASH_TABLE;
 860
 861         /* collect some hash table performance data */
 862         if (i > 0) {
 863                 ARCSTAT_BUMP(arcstat_hash_collisions);
 864                 if (i == 1)
 865                         ARCSTAT_BUMP(arcstat_hash_chains);
 866
 867                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 868         }
 869
 870         ARCSTAT_BUMP(arcstat_hash_elements);
 871         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 872
 873         return (NULL);
 874 }
 875
 876 static void
 877 buf_hash_remove(arc_buf_hdr_t *buf)
 878 {
 879         arc_buf_hdr_t *fbuf, **bufp;
 880         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 881
 882         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 883         ASSERT(HDR_IN_HASH_TABLE(buf));
 884
 885         bufp = &buf_hash_table.ht_table[idx];
 886         while ((fbuf = *bufp) != buf) {
 887                 ASSERT(fbuf != NULL);
 888                 bufp = &fbuf->b_hash_next;
 889         }
 890         *bufp = buf->b_hash_next;
 891         buf->b_hash_next = NULL;
 892         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 893
 894         /* collect some hash table performance data */
 895         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 896
 897         if (buf_hash_table.ht_table[idx] &&
 898             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 899                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 900 }
 901
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * The three caches below are created in buf_init() and destroyed in
 * buf_fini(): hdr_cache holds arc_buf_hdr_t objects, buf_cache holds
 * arc_buf_t objects, and l2arc_hdr_cache holds objects of L2HDR_SIZE
 * bytes.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;
static kmem_cache_t *l2arc_hdr_cache;
 908
 909 static void
 910 buf_fini(void)
 911 {
 912         int i;
 913
 914 #if defined(_KERNEL) && defined(HAVE_SPL)
 915         /*
 916          * Large allocations which do not require contiguous pages
 917          * should be using vmem_free() in the linux kernel\
 918          */
 919         vmem_free(buf_hash_table.ht_table,
 920             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 921 #else
 922         kmem_free(buf_hash_table.ht_table,
 923             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 924 #endif
 925         for (i = 0; i < BUF_LOCKS; i++)
 926                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 927         kmem_cache_destroy(hdr_cache);
 928         kmem_cache_destroy(buf_cache);
 929         kmem_cache_destroy(l2arc_hdr_cache);
 930 }
 931
 932 /*
 933  * Constructor callback - called when the cache is empty
 934  * and a new buf is requested.
 935  */
 936 /* ARGSUSED */
 937 static int
 938 hdr_cons(void *vbuf, void *unused, int kmflag)
 939 {
 940         arc_buf_hdr_t *buf = vbuf;
 941
 942         bzero(buf, sizeof (arc_buf_hdr_t));
 943         refcount_create(&buf->b_refcnt);
 944         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 945         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 946         list_link_init(&buf->b_arc_node);
 947         list_link_init(&buf->b_l2node);
 948         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 949
 950         return (0);
 951 }
 952
 953 /* ARGSUSED */
 954 static int
 955 buf_cons(void *vbuf, void *unused, int kmflag)
 956 {
 957         arc_buf_t *buf = vbuf;
 958
 959         bzero(buf, sizeof (arc_buf_t));
 960         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 961         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 962
 963         return (0);
 964 }
 965
 966 /*
 967  * Destructor callback - called when a cached buf is
 968  * no longer required.
 969  */
 970 /* ARGSUSED */
 971 static void
 972 hdr_dest(void *vbuf, void *unused)
 973 {
 974         arc_buf_hdr_t *buf = vbuf;
 975
 976         ASSERT(BUF_EMPTY(buf));
 977         refcount_destroy(&buf->b_refcnt);
 978         cv_destroy(&buf->b_cv);
 979         mutex_destroy(&buf->b_freeze_lock);
 980         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 981 }
 982
 983 /* ARGSUSED */
 984 static void
 985 buf_dest(void *vbuf, void *unused)
 986 {
 987         arc_buf_t *buf = vbuf;
 988
 989         mutex_destroy(&buf->b_evict_lock);
 990         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 991 }
 992
 993 static void
 994 buf_init(void)
 995 {
 996         uint64_t *ct;
 997         uint64_t hsize = 1ULL << 12;
 998         int i, j;
 999
1000         /*
1001          * The hash table is big enough to fill all of physical memory
1002          * with an average 64K block size.  The table will take up
1003          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
1004          */
1005         while (hsize * 65536 < physmem * PAGESIZE)
1006                 hsize <<= 1;
1007 retry:
1008         buf_hash_table.ht_mask = hsize - 1;
1009 #if defined(_KERNEL) && defined(HAVE_SPL)
1010         /*
1011          * Large allocations which do not require contiguous pages
1012          * should be using vmem_alloc() in the linux kernel
1013          */
1014         buf_hash_table.ht_table =
1015             vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
1016 #else
1017         buf_hash_table.ht_table =
1018             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1019 #endif
1020         if (buf_hash_table.ht_table == NULL) {
1021                 ASSERT(hsize > (1ULL << 8));
1022                 hsize >>= 1;
1023                 goto retry;
1024         }
1025
1026         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1027             0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0);
1028         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1029             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1030         l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE,
1031             0, NULL, NULL, NULL, NULL, NULL, 0);
1032
1033         for (i = 0; i < 256; i++)
1034                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1035                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1036
1037         for (i = 0; i < BUF_LOCKS; i++) {
1038                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1039                     NULL, MUTEX_DEFAULT, NULL);
1040         }
1041 }
1042
/* hz / 16 clock ticks; with hz ticks per second that is about 62 ms */
#define ARC_MINTIME     (hz>>4) /* 62 ms */
1044
1045 static void
1046 arc_cksum_verify(arc_buf_t *buf)
1047 {
1048         zio_cksum_t zc;
1049
1050         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1051                 return;
1052
1053         mutex_enter(&buf->b_hdr->b_freeze_lock);
1054         if (buf->b_hdr->b_freeze_cksum == NULL ||
1055             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1056                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1057                 return;
1058         }
1059         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1060         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1061                 panic("buffer modified while frozen!");
1062         mutex_exit(&buf->b_hdr->b_freeze_lock);
1063 }
1064
1065 static int
1066 arc_cksum_equal(arc_buf_t *buf)
1067 {
1068         zio_cksum_t zc;
1069         int equal;
1070
1071         mutex_enter(&buf->b_hdr->b_freeze_lock);
1072         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1073         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1074         mutex_exit(&buf->b_hdr->b_freeze_lock);
1075
1076         return (equal);
1077 }
1078
1079 static void
1080 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1081 {
1082         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1083                 return;
1084
1085         mutex_enter(&buf->b_hdr->b_freeze_lock);
1086         if (buf->b_hdr->b_freeze_cksum != NULL) {
1087                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1088                 return;
1089         }
1090         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1091             KM_PUSHPAGE);
1092         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1093             buf->b_hdr->b_freeze_cksum);
1094         mutex_exit(&buf->b_hdr->b_freeze_lock);
1095         arc_buf_watch(buf);
1096 }
1097
1098 #ifndef _KERNEL
1099 void
1100 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
1101 {
1102         panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
1103 }
1104 #endif
1105
1106 /* ARGSUSED */
1107 static void
1108 arc_buf_unwatch(arc_buf_t *buf)
1109 {
1110 #ifndef _KERNEL
1111         if (arc_watch) {
1112                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
1113                     PROT_READ | PROT_WRITE));
1114         }
1115 #endif
1116 }
1117
1118 /* ARGSUSED */
1119 static void
1120 arc_buf_watch(arc_buf_t *buf)
1121 {
1122 #ifndef _KERNEL
1123         if (arc_watch)
1124                 ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
1125 #endif
1126 }
1127
1128 void
1129 arc_buf_thaw(arc_buf_t *buf)
1130 {
1131         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1132                 if (buf->b_hdr->b_state != arc_anon)
1133                         panic("modifying non-anon buffer!");
1134                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1135                         panic("modifying buffer while i/o in progress!");
1136                 arc_cksum_verify(buf);
1137         }
1138
1139         mutex_enter(&buf->b_hdr->b_freeze_lock);
1140         if (buf->b_hdr->b_freeze_cksum != NULL) {
1141                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1142                 buf->b_hdr->b_freeze_cksum = NULL;
1143         }
1144
1145         mutex_exit(&buf->b_hdr->b_freeze_lock);
1146
1147         arc_buf_unwatch(buf);
1148 }
1149
1150 void
1151 arc_buf_freeze(arc_buf_t *buf)
1152 {
1153         kmutex_t *hash_lock;
1154
1155         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1156                 return;
1157
1158         hash_lock = HDR_LOCK(buf->b_hdr);
1159         mutex_enter(hash_lock);
1160
1161         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1162             buf->b_hdr->b_state == arc_anon);
1163         arc_cksum_compute(buf, B_FALSE);
1164         mutex_exit(hash_lock);
1165
1166 }
1167
1168 static void
1169 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1170 {
1171         ASSERT(MUTEX_HELD(hash_lock));
1172
1173         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1174             (ab->b_state != arc_anon)) {
1175                 uint64_t delta = ab->b_size * ab->b_datacnt;
1176                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1177                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1178
1179                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1180                 mutex_enter(&ab->b_state->arcs_mtx);
1181                 ASSERT(list_link_active(&ab->b_arc_node));
1182                 list_remove(list, ab);
1183                 if (GHOST_STATE(ab->b_state)) {
1184                         ASSERT0(ab->b_datacnt);
1185                         ASSERT3P(ab->b_buf, ==, NULL);
1186                         delta = ab->b_size;
1187                 }
1188                 ASSERT(delta > 0);
1189                 ASSERT3U(*size, >=, delta);
1190                 atomic_add_64(size, -delta);
1191                 mutex_exit(&ab->b_state->arcs_mtx);
1192                 /* remove the prefetch flag if we get a reference */
1193                 if (ab->b_flags & ARC_PREFETCH)
1194                         ab->b_flags &= ~ARC_PREFETCH;
1195         }
1196 }
1197
1198 static int
1199 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1200 {
1201         int cnt;
1202         arc_state_t *state = ab->b_state;
1203
1204         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1205         ASSERT(!GHOST_STATE(state));
1206
1207         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1208             (state != arc_anon)) {
1209                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1210
1211                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1212                 mutex_enter(&state->arcs_mtx);
1213                 ASSERT(!list_link_active(&ab->b_arc_node));
1214                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1215                 ASSERT(ab->b_datacnt > 0);
1216                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1217                 mutex_exit(&state->arcs_mtx);
1218         }
1219         return (cnt);
1220 }
1221
1222 /*
1223  * Returns detailed information about a specific arc buffer.  When the
1224  * state_index argument is set the function will calculate the arc header
1225  * list position for its arc state.  Since this requires a linear traversal
1226  * callers are strongly encourage not to do this.  However, it can be helpful
1227  * for targeted analysis so the functionality is provided.
1228  */
1229 void
1230 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
1231 {
1232         arc_buf_hdr_t *hdr = ab->b_hdr;
1233         arc_state_t *state = hdr->b_state;
1234
1235         memset(abi, 0, sizeof (arc_buf_info_t));
1236         abi->abi_flags = hdr->b_flags;
1237         abi->abi_datacnt = hdr->b_datacnt;
1238         abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
1239         abi->abi_state_contents = hdr->b_type;
1240         abi->abi_state_index = -1;
1241         abi->abi_size = hdr->b_size;
1242         abi->abi_access = hdr->b_arc_access;
1243         abi->abi_mru_hits = hdr->b_mru_hits;
1244         abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits;
1245         abi->abi_mfu_hits = hdr->b_mfu_hits;
1246         abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
1247         abi->abi_holds = refcount_count(&hdr->b_refcnt);
1248
1249         if (hdr->b_l2hdr) {
1250                 abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr;
1251                 abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize;
1252                 abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress;
1253                 abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits;
1254         }
1255
1256         if (state && state_index && list_link_active(&hdr->b_arc_node)) {
1257                 list_t *list = &state->arcs_list[hdr->b_type];
1258                 arc_buf_hdr_t *h;
1259
1260                 mutex_enter(&state->arcs_mtx);
1261                 for (h = list_head(list); h != NULL; h = list_next(list, h)) {
1262                         abi->abi_state_index++;
1263                         if (h == hdr)
1264                                 break;
1265                 }
1266                 mutex_exit(&state->arcs_mtx);
1267         }
1268 }
1269
/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 *
 *   new_state - state to move the header into; must differ from the
 *               header's current state (asserted)
 *   ab        - header being moved
 *   hash_lock - the header's hash lock, which must be held (asserted)
 *
 * Two levels of accounting are adjusted.  If the header has no
 * references it is moved between the per-type state lists and the
 * lists' byte counts (arcs_lsize) are updated.  Independently of the
 * reference count, the states' total sizes (arcs_size) are updated.
 * The deltas default to b_datacnt * b_size; a ghost state uses b_size
 * instead, since ghost headers hold no data.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
        arc_state_t *old_state = ab->b_state;
        int64_t refcnt = refcount_count(&ab->b_refcnt);
        uint64_t from_delta, to_delta;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
        ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

        /* default byte count charged to / released from each state */
        from_delta = to_delta = ab->b_datacnt * ab->b_size;

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon) {
                        /* take the state mutex only if it is not held yet */
                        int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
                        uint64_t *size = &old_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&old_state->arcs_mtx);

                        ASSERT(list_link_active(&ab->b_arc_node));
                        list_remove(&old_state->arcs_list[ab->b_type], ab);

                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
                                ASSERT(ab->b_buf == NULL);
                                from_delta = ab->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);

                        if (use_mutex)
                                mutex_exit(&old_state->arcs_mtx);
                }
                if (new_state != arc_anon) {
                        /* take the state mutex only if it is not held yet */
                        int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
                        uint64_t *size = &new_state->arcs_lsize[ab->b_type];

                        if (use_mutex)
                                mutex_enter(&new_state->arcs_mtx);

                        list_insert_head(&new_state->arcs_list[ab->b_type], ab);

                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
                                ASSERT(ab->b_datacnt == 0);
                                ASSERT(ab->b_buf == NULL);
                                to_delta = ab->b_size;
                        }
                        atomic_add_64(size, to_delta);

                        if (use_mutex)
                                mutex_exit(&new_state->arcs_mtx);
                }
        }

        /* a header that becomes anonymous leaves the hash table */
        ASSERT(!BUF_EMPTY(ab));
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);

        /* adjust state sizes */
        if (to_delta)
                atomic_add_64(&new_state->arcs_size, to_delta);
        if (from_delta) {
                ASSERT3U(old_state->arcs_size, >=, from_delta);
                atomic_add_64(&old_state->arcs_size, -from_delta);
        }
        ab->b_state = new_state;

        /* adjust l2arc hdr stats */
        if (new_state == arc_l2c_only)
                l2arc_hdr_stat_add();
        else if (old_state == arc_l2c_only)
                l2arc_hdr_stat_remove();
}
1360
1361 void
1362 arc_space_consume(uint64_t space, arc_space_type_t type)
1363 {
1364         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1365
1366         switch (type) {
1367         default:
1368                 break;
1369         case ARC_SPACE_DATA:
1370                 ARCSTAT_INCR(arcstat_data_size, space);
1371                 break;
1372         case ARC_SPACE_META:
1373                 ARCSTAT_INCR(arcstat_meta_size, space);
1374                 break;
1375         case ARC_SPACE_OTHER:
1376                 ARCSTAT_INCR(arcstat_other_size, space);
1377                 break;
1378         case ARC_SPACE_HDRS:
1379                 ARCSTAT_INCR(arcstat_hdr_size, space);
1380                 break;
1381         case ARC_SPACE_L2HDRS:
1382                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1383                 break;
1384         }
1385
1386         if (type != ARC_SPACE_DATA)
1387                 ARCSTAT_INCR(arcstat_meta_used, space);
1388
1389         atomic_add_64(&arc_size, space);
1390 }
1391
1392 void
1393 arc_space_return(uint64_t space, arc_space_type_t type)
1394 {
1395         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1396
1397         switch (type) {
1398         default:
1399                 break;
1400         case ARC_SPACE_DATA:
1401                 ARCSTAT_INCR(arcstat_data_size, -space);
1402                 break;
1403         case ARC_SPACE_META:
1404                 ARCSTAT_INCR(arcstat_meta_size, -space);
1405                 break;
1406         case ARC_SPACE_OTHER:
1407                 ARCSTAT_INCR(arcstat_other_size, -space);
1408                 break;
1409         case ARC_SPACE_HDRS:
1410                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1411                 break;
1412         case ARC_SPACE_L2HDRS:
1413                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1414                 break;
1415         }
1416
1417         if (type != ARC_SPACE_DATA) {
1418                 ASSERT(arc_meta_used >= space);
1419                 if (arc_meta_max < arc_meta_used)
1420                         arc_meta_max = arc_meta_used;
1421                 ARCSTAT_INCR(arcstat_meta_used, -space);
1422         }
1423
1424         ASSERT(arc_size >= space);
1425         atomic_add_64(&arc_size, -space);
1426 }
1427
/*
 * Allocate a new anonymous arc buffer of `size' bytes (size must be
 * greater than zero) holding contents of the given type, for the pool
 * identified by spa's load guid.
 *
 * A fresh header and buffer are taken from their kmem caches, linked
 * together, and a data block is attached with arc_get_data_buf().  The
 * returned buffer carries exactly one reference, held by `tag'.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
        arc_buf_hdr_t *hdr;
        arc_buf_t *buf;

        ASSERT3U(size, >, 0);
        hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        /* a header fresh from the cache must carry no identity */
        ASSERT(BUF_EMPTY(hdr));
        hdr->b_size = size;
        hdr->b_type = type;
        hdr->b_spa = spa_load_guid(spa);
        hdr->b_state = arc_anon;
        hdr->b_arc_access = 0;
        hdr->b_mru_hits = 0;
        hdr->b_mru_ghost_hits = 0;
        hdr->b_mfu_hits = 0;
        hdr->b_mfu_ghost_hits = 0;
        hdr->b_l2_hits = 0;
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_next = NULL;
        hdr->b_buf = buf;
        /* attach the data block now that buf and hdr point at each other */
        arc_get_data_buf(buf);
        hdr->b_datacnt = 1;
        hdr->b_flags = 0;
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        (void) refcount_add(&hdr->b_refcnt, tag);

        return (buf);
}
1462
/* reference tag that holds a buffer while it is out on loan */
static char *arc_onloan_tag = "onloan";
1464
1465 /*
1466  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1467  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1468  * buffers must be returned to the arc before they can be used by the DMU or
1469  * freed.
1470  */
1471 arc_buf_t *
1472 arc_loan_buf(spa_t *spa, int size)
1473 {
1474         arc_buf_t *buf;
1475
1476         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1477
1478         atomic_add_64(&arc_loaned_bytes, size);
1479         return (buf);
1480 }
1481
1482 /*
1483  * Return a loaned arc buffer to the arc.
1484  */
1485 void
1486 arc_return_buf(arc_buf_t *buf, void *tag)
1487 {
1488         arc_buf_hdr_t *hdr = buf->b_hdr;
1489
1490         ASSERT(buf->b_data != NULL);
1491         (void) refcount_add(&hdr->b_refcnt, tag);
1492         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1493
1494         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1495 }
1496
1497 /* Detach an arc_buf from a dbuf (tag) */
1498 void
1499 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1500 {
1501         arc_buf_hdr_t *hdr;
1502
1503         ASSERT(buf->b_data != NULL);
1504         hdr = buf->b_hdr;
1505         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1506         (void) refcount_remove(&hdr->b_refcnt, tag);
1507         buf->b_efunc = NULL;
1508         buf->b_private = NULL;
1509
1510         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1511 }
1512
1513 static arc_buf_t *
1514 arc_buf_clone(arc_buf_t *from)
1515 {
1516         arc_buf_t *buf;
1517         arc_buf_hdr_t *hdr = from->b_hdr;
1518         uint64_t size = hdr->b_size;
1519
1520         ASSERT(hdr->b_state != arc_anon);
1521
1522         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1523         buf->b_hdr = hdr;
1524         buf->b_data = NULL;
1525         buf->b_efunc = NULL;
1526         buf->b_private = NULL;
1527         buf->b_next = hdr->b_buf;
1528         hdr->b_buf = buf;
1529         arc_get_data_buf(buf);
1530         bcopy(from->b_data, buf->b_data, size);
1531
1532         /*
1533          * This buffer already exists in the arc so create a duplicate
1534          * copy for the caller.  If the buffer is associated with user data
1535          * then track the size and number of duplicates.  These stats will be
1536          * updated as duplicate buffers are created and destroyed.
1537          */
1538         if (hdr->b_type == ARC_BUFC_DATA) {
1539                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1540                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1541         }
1542         hdr->b_datacnt += 1;
1543         return (buf);
1544 }
1545
1546 void
1547 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1548 {
1549         arc_buf_hdr_t *hdr;
1550         kmutex_t *hash_lock;
1551
1552         /*
1553          * Check to see if this buffer is evicted.  Callers
1554          * must verify b_data != NULL to know if the add_ref
1555          * was successful.
1556          */
1557         mutex_enter(&buf->b_evict_lock);
1558         if (buf->b_data == NULL) {
1559                 mutex_exit(&buf->b_evict_lock);
1560                 return;
1561         }
1562         hash_lock = HDR_LOCK(buf->b_hdr);
1563         mutex_enter(hash_lock);
1564         hdr = buf->b_hdr;
1565         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1566         mutex_exit(&buf->b_evict_lock);
1567
1568         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1569         add_reference(hdr, hash_lock, tag);
1570         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1571         arc_access(hdr, hash_lock);
1572         mutex_exit(hash_lock);
1573         ARCSTAT_BUMP(arcstat_hits);
1574         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1575             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1576             data, metadata, hits);
1577 }
1578
1579 /*
1580  * Free the arc data buffer.  If it is an l2arc write in progress,
1581  * the buffer is placed on l2arc_free_on_write to be freed later.
1582  */
1583 static void
1584 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1585 {
1586         arc_buf_hdr_t *hdr = buf->b_hdr;
1587
1588         if (HDR_L2_WRITING(hdr)) {
1589                 l2arc_data_free_t *df;
1590                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE);
1591                 df->l2df_data = buf->b_data;
1592                 df->l2df_size = hdr->b_size;
1593                 df->l2df_func = free_func;
1594                 mutex_enter(&l2arc_free_on_write_mtx);
1595                 list_insert_head(l2arc_free_on_write, df);
1596                 mutex_exit(&l2arc_free_on_write_mtx);
1597                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1598         } else {
1599                 free_func(buf->b_data, hdr->b_size);
1600         }
1601 }
1602
1603 static void
1604 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1605 {
1606         arc_buf_t **bufp;
1607
1608         /* free up data associated with the buf */
1609         if (buf->b_data) {
1610                 arc_state_t *state = buf->b_hdr->b_state;
1611                 uint64_t size = buf->b_hdr->b_size;
1612                 arc_buf_contents_t type = buf->b_hdr->b_type;
1613
1614                 arc_cksum_verify(buf);
1615                 arc_buf_unwatch(buf);
1616
1617                 if (!recycle) {
1618                         if (type == ARC_BUFC_METADATA) {
1619                                 arc_buf_data_free(buf, zio_buf_free);
1620                                 arc_space_return(size, ARC_SPACE_META);
1621                         } else {
1622                                 ASSERT(type == ARC_BUFC_DATA);
1623                                 arc_buf_data_free(buf, zio_data_buf_free);
1624                                 arc_space_return(size, ARC_SPACE_DATA);
1625                         }
1626                 }
1627                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1628                         uint64_t *cnt = &state->arcs_lsize[type];
1629
1630                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1631                         ASSERT(state != arc_anon);
1632
1633                         ASSERT3U(*cnt, >=, size);
1634                         atomic_add_64(cnt, -size);
1635                 }
1636                 ASSERT3U(state->arcs_size, >=, size);
1637                 atomic_add_64(&state->arcs_size, -size);
1638                 buf->b_data = NULL;
1639
1640                 /*
1641                  * If we're destroying a duplicate buffer make sure
1642                  * that the appropriate statistics are updated.
1643                  */
1644                 if (buf->b_hdr->b_datacnt > 1 &&
1645                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1646                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1647                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1648                 }
1649                 ASSERT(buf->b_hdr->b_datacnt > 0);
1650                 buf->b_hdr->b_datacnt -= 1;
1651         }
1652
1653         /* only remove the buf if requested */
1654         if (!all)
1655                 return;
1656
1657         /* remove the buf from the hdr list */
1658         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1659                 continue;
1660         *bufp = buf->b_next;
1661         buf->b_next = NULL;
1662
1663         ASSERT(buf->b_efunc == NULL);
1664
1665         /* clean up the buf */
1666         buf->b_hdr = NULL;
1667         kmem_cache_free(buf_cache, buf);
1668 }
1669
1670 static void
1671 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1672 {
1673         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1674
1675         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1676         ASSERT3P(hdr->b_state, ==, arc_anon);
1677         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1678
1679         if (l2hdr != NULL) {
1680                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1681                 /*
1682                  * To prevent arc_free() and l2arc_evict() from
1683                  * attempting to free the same buffer at the same time,
1684                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1685                  * give it priority.  l2arc_evict() can't destroy this
1686                  * header while we are waiting on l2arc_buflist_mtx.
1687                  *
1688                  * The hdr may be removed from l2ad_buflist before we
1689                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1690                  */
1691                 if (!buflist_held) {
1692                         mutex_enter(&l2arc_buflist_mtx);
1693                         l2hdr = hdr->b_l2hdr;
1694                 }
1695
1696                 if (l2hdr != NULL) {
1697                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1698                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1699                         ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1700                         kmem_cache_free(l2arc_hdr_cache, l2hdr);
1701                         arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
1702                         if (hdr->b_state == arc_l2c_only)
1703                                 l2arc_hdr_stat_remove();
1704                         hdr->b_l2hdr = NULL;
1705                 }
1706
1707                 if (!buflist_held)
1708                         mutex_exit(&l2arc_buflist_mtx);
1709         }
1710
1711         if (!BUF_EMPTY(hdr)) {
1712                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1713                 buf_discard_identity(hdr);
1714         }
1715         while (hdr->b_buf) {
1716                 arc_buf_t *buf = hdr->b_buf;
1717
1718                 if (buf->b_efunc) {
1719                         mutex_enter(&arc_eviction_mtx);
1720                         mutex_enter(&buf->b_evict_lock);
1721                         ASSERT(buf->b_hdr != NULL);
1722                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1723                         hdr->b_buf = buf->b_next;
1724                         buf->b_hdr = &arc_eviction_hdr;
1725                         buf->b_next = arc_eviction_list;
1726                         arc_eviction_list = buf;
1727                         mutex_exit(&buf->b_evict_lock);
1728                         mutex_exit(&arc_eviction_mtx);
1729                 } else {
1730                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1731                 }
1732         }
1733         if (hdr->b_freeze_cksum != NULL) {
1734                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1735                 hdr->b_freeze_cksum = NULL;
1736         }
1737
1738         ASSERT(!list_link_active(&hdr->b_arc_node));
1739         ASSERT3P(hdr->b_hash_next, ==, NULL);
1740         ASSERT3P(hdr->b_acb, ==, NULL);
1741         kmem_cache_free(hdr_cache, hdr);
1742 }
1743
1744 void
1745 arc_buf_free(arc_buf_t *buf, void *tag)
1746 {
1747         arc_buf_hdr_t *hdr = buf->b_hdr;
1748         int hashed = hdr->b_state != arc_anon;
1749
1750         ASSERT(buf->b_efunc == NULL);
1751         ASSERT(buf->b_data != NULL);
1752
1753         if (hashed) {
1754                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1755
1756                 mutex_enter(hash_lock);
1757                 hdr = buf->b_hdr;
1758                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1759
1760                 (void) remove_reference(hdr, hash_lock, tag);
1761                 if (hdr->b_datacnt > 1) {
1762                         arc_buf_destroy(buf, FALSE, TRUE);
1763                 } else {
1764                         ASSERT(buf == hdr->b_buf);
1765                         ASSERT(buf->b_efunc == NULL);
1766                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1767                 }
1768                 mutex_exit(hash_lock);
1769         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1770                 int destroy_hdr;
1771                 /*
1772                  * We are in the middle of an async write.  Don't destroy
1773                  * this buffer unless the write completes before we finish
1774                  * decrementing the reference count.
1775                  */
1776                 mutex_enter(&arc_eviction_mtx);
1777                 (void) remove_reference(hdr, NULL, tag);
1778                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1779                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1780                 mutex_exit(&arc_eviction_mtx);
1781                 if (destroy_hdr)
1782                         arc_hdr_destroy(hdr);
1783         } else {
1784                 if (remove_reference(hdr, NULL, tag) > 0)
1785                         arc_buf_destroy(buf, FALSE, TRUE);
1786                 else
1787                         arc_hdr_destroy(hdr);
1788         }
1789 }
1790
1791 boolean_t
1792 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1793 {
1794         arc_buf_hdr_t *hdr = buf->b_hdr;
1795         kmutex_t *hash_lock = NULL;
1796         boolean_t no_callback = (buf->b_efunc == NULL);
1797
1798         if (hdr->b_state == arc_anon) {
1799                 ASSERT(hdr->b_datacnt == 1);
1800                 arc_buf_free(buf, tag);
1801                 return (no_callback);
1802         }
1803
1804         hash_lock = HDR_LOCK(hdr);
1805         mutex_enter(hash_lock);
1806         hdr = buf->b_hdr;
1807         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1808         ASSERT(hdr->b_state != arc_anon);
1809         ASSERT(buf->b_data != NULL);
1810
1811         (void) remove_reference(hdr, hash_lock, tag);
1812         if (hdr->b_datacnt > 1) {
1813                 if (no_callback)
1814                         arc_buf_destroy(buf, FALSE, TRUE);
1815         } else if (no_callback) {
1816                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1817                 ASSERT(buf->b_efunc == NULL);
1818                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1819         }
1820         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1821             refcount_is_zero(&hdr->b_refcnt));
1822         mutex_exit(hash_lock);
1823         return (no_callback);
1824 }
1825
1826 int
1827 arc_buf_size(arc_buf_t *buf)
1828 {
1829         return (buf->b_hdr->b_size);
1830 }
1831
1832 /*
1833  * Called from the DMU to determine if the current buffer should be
1834  * evicted. In order to ensure proper locking, the eviction must be initiated
1835  * from the DMU. Return true if the buffer is associated with user data and
1836  * duplicate buffers still exist.
1837  */
1838 boolean_t
1839 arc_buf_eviction_needed(arc_buf_t *buf)
1840 {
1841         arc_buf_hdr_t *hdr;
1842         boolean_t evict_needed = B_FALSE;
1843
1844         if (zfs_disable_dup_eviction)
1845                 return (B_FALSE);
1846
1847         mutex_enter(&buf->b_evict_lock);
1848         hdr = buf->b_hdr;
1849         if (hdr == NULL) {
1850                 /*
1851                  * We are in arc_do_user_evicts(); let that function
1852                  * perform the eviction.
1853                  */
1854                 ASSERT(buf->b_data == NULL);
1855                 mutex_exit(&buf->b_evict_lock);
1856                 return (B_FALSE);
1857         } else if (buf->b_data == NULL) {
1858                 /*
1859                  * We have already been added to the arc eviction list;
1860                  * recommend eviction.
1861                  */
1862                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1863                 mutex_exit(&buf->b_evict_lock);
1864                 return (B_TRUE);
1865         }
1866
1867         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1868                 evict_needed = B_TRUE;
1869
1870         mutex_exit(&buf->b_evict_lock);
1871         return (evict_needed);
1872 }
1873
1874 /*
1875  * Evict buffers from list until we've removed the specified number of
1876  * bytes.  Move the removed buffers to the appropriate evict state.
1877  * If the recycle flag is set, then attempt to "recycle" a buffer:
1878  * - look for a buffer to evict that is `bytes' long.
1879  * - return the data block from this buffer rather than freeing it.
1880  * This flag is used by callers that are trying to make space for a
1881  * new buffer in a full arc cache.
1882  *
1883  * This function makes a "best effort".  It skips over any buffers
1884  * it can't get a hash_lock on, and so may not catch all candidates.
1885  * It may also return without evicting as much space as requested.
1886  */
1887 static void *
1888 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1889     arc_buf_contents_t type)
1890 {
1891         arc_state_t *evicted_state;
1892         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1893         arc_buf_hdr_t *ab, *ab_prev = NULL;
1894         list_t *list = &state->arcs_list[type];
1895         kmutex_t *hash_lock;
1896         boolean_t have_lock;
1897         void *stolen = NULL;
1898         arc_buf_hdr_t marker = {{{ 0 }}};
1899         int count = 0;
1900
1901         ASSERT(state == arc_mru || state == arc_mfu);
1902
1903         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1904
1905 top:
1906         mutex_enter(&state->arcs_mtx);
1907         mutex_enter(&evicted_state->arcs_mtx);
1908
1909         for (ab = list_tail(list); ab; ab = ab_prev) {
1910                 ab_prev = list_prev(list, ab);
1911                 /* prefetch buffers have a minimum lifespan */
1912                 if (HDR_IO_IN_PROGRESS(ab) ||
1913                     (spa && ab->b_spa != spa) ||
1914                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1915                     ddi_get_lbolt() - ab->b_arc_access <
1916                     zfs_arc_min_prefetch_lifespan)) {
1917                         skipped++;
1918                         continue;
1919                 }
1920                 /* "lookahead" for better eviction candidate */
1921                 if (recycle && ab->b_size != bytes &&
1922                     ab_prev && ab_prev->b_size == bytes)
1923                         continue;
1924
1925                 /* ignore markers */
1926                 if (ab->b_spa == 0)
1927                         continue;
1928
1929                 /*
1930                  * It may take a long time to evict all the bufs requested.
1931                  * To avoid blocking all arc activity, periodically drop
1932                  * the arcs_mtx and give other threads a chance to run
1933                  * before reacquiring the lock.
1934                  *
1935                  * If we are looking for a buffer to recycle, we are in
1936                  * the hot code path, so don't sleep.
1937                  */
1938                 if (!recycle && count++ > arc_evict_iterations) {
1939                         list_insert_after(list, ab, &marker);
1940                         mutex_exit(&evicted_state->arcs_mtx);
1941                         mutex_exit(&state->arcs_mtx);
1942                         kpreempt(KPREEMPT_SYNC);
1943                         mutex_enter(&state->arcs_mtx);
1944                         mutex_enter(&evicted_state->arcs_mtx);
1945                         ab_prev = list_prev(list, &marker);
1946                         list_remove(list, &marker);
1947                         count = 0;
1948                         continue;
1949                 }
1950
1951                 hash_lock = HDR_LOCK(ab);
1952                 have_lock = MUTEX_HELD(hash_lock);
1953                 if (have_lock || mutex_tryenter(hash_lock)) {
1954                         ASSERT0(refcount_count(&ab->b_refcnt));
1955                         ASSERT(ab->b_datacnt > 0);
1956                         while (ab->b_buf) {
1957                                 arc_buf_t *buf = ab->b_buf;
1958                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1959                                         missed += 1;
1960                                         break;
1961                                 }
1962                                 if (buf->b_data) {
1963                                         bytes_evicted += ab->b_size;
1964                                         if (recycle && ab->b_type == type &&
1965                                             ab->b_size == bytes &&
1966                                             !HDR_L2_WRITING(ab)) {
1967                                                 stolen = buf->b_data;
1968                                                 recycle = FALSE;
1969                                         }
1970                                 }
1971                                 if (buf->b_efunc) {
1972                                         mutex_enter(&arc_eviction_mtx);
1973                                         arc_buf_destroy(buf,
1974                                             buf->b_data == stolen, FALSE);
1975                                         ab->b_buf = buf->b_next;
1976                                         buf->b_hdr = &arc_eviction_hdr;
1977                                         buf->b_next = arc_eviction_list;
1978                                         arc_eviction_list = buf;
1979                                         mutex_exit(&arc_eviction_mtx);
1980                                         mutex_exit(&buf->b_evict_lock);
1981                                 } else {
1982                                         mutex_exit(&buf->b_evict_lock);
1983                                         arc_buf_destroy(buf,
1984                                             buf->b_data == stolen, TRUE);
1985                                 }
1986                         }
1987
1988                         if (ab->b_l2hdr) {
1989                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1990                                     ab->b_size);
1991                         } else {
1992                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1993                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1994                                             ab->b_size);
1995                                 } else {
1996                                         ARCSTAT_INCR(
1997                                             arcstat_evict_l2_ineligible,
1998                                             ab->b_size);
1999                                 }
2000                         }
2001
2002                         if (ab->b_datacnt == 0) {
2003                                 arc_change_state(evicted_state, ab, hash_lock);
2004                                 ASSERT(HDR_IN_HASH_TABLE(ab));
2005                                 ab->b_flags |= ARC_IN_HASH_TABLE;
2006                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2007                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2008                         }
2009                         if (!have_lock)
2010                                 mutex_exit(hash_lock);
2011                         if (bytes >= 0 && bytes_evicted >= bytes)
2012                                 break;
2013                 } else {
2014                         missed += 1;
2015                 }
2016         }
2017
2018         mutex_exit(&evicted_state->arcs_mtx);
2019         mutex_exit(&state->arcs_mtx);
2020
2021         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2022             (bytes < 0 || bytes_evicted < bytes)) {
2023                 /* Prevent second pass from recycling metadata into data */
2024                 recycle = FALSE;
2025                 type = ARC_BUFC_METADATA;
2026                 list = &state->arcs_list[type];
2027                 goto top;
2028         }
2029
2030         if (bytes_evicted < bytes)
2031                 dprintf("only evicted %lld bytes from %x\n",
2032                     (longlong_t)bytes_evicted, state);
2033
2034         if (skipped)
2035                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2036
2037         if (missed)
2038                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2039
2040         /*
2041          * Note: we have just evicted some data into the ghost state,
2042          * potentially putting the ghost size over the desired size.  Rather
2043          * that evicting from the ghost list in this hot code path, leave
2044          * this chore to the arc_reclaim_thread().
2045          */
2046
2047         return (stolen);
2048 }
2049
2050 /*
2051  * Remove buffers from list until we've removed the specified number of
2052  * bytes.  Destroy the buffers that are removed.
2053  */
2054 static void
2055 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes,
2056     arc_buf_contents_t type)
2057 {
2058         arc_buf_hdr_t *ab, *ab_prev;
2059         arc_buf_hdr_t marker;
2060         list_t *list = &state->arcs_list[type];
2061         kmutex_t *hash_lock;
2062         uint64_t bytes_deleted = 0;
2063         uint64_t bufs_skipped = 0;
2064         int count = 0;
2065
2066         ASSERT(GHOST_STATE(state));
2067         bzero(&marker, sizeof (marker));
2068 top:
2069         mutex_enter(&state->arcs_mtx);
2070         for (ab = list_tail(list); ab; ab = ab_prev) {
2071                 ab_prev = list_prev(list, ab);
2072                 if (ab->b_type > ARC_BUFC_NUMTYPES)
2073                         panic("invalid ab=%p", (void *)ab);
2074                 if (spa && ab->b_spa != spa)
2075                         continue;
2076
2077                 /* ignore markers */
2078                 if (ab->b_spa == 0)
2079                         continue;
2080
2081                 hash_lock = HDR_LOCK(ab);
2082                 /* caller may be trying to modify this buffer, skip it */
2083                 if (MUTEX_HELD(hash_lock))
2084                         continue;
2085
2086                 /*
2087                  * It may take a long time to evict all the bufs requested.
2088                  * To avoid blocking all arc activity, periodically drop
2089                  * the arcs_mtx and give other threads a chance to run
2090                  * before reacquiring the lock.
2091                  */
2092                 if (count++ > arc_evict_iterations) {
2093                         list_insert_after(list, ab, &marker);
2094                         mutex_exit(&state->arcs_mtx);
2095                         kpreempt(KPREEMPT_SYNC);
2096                         mutex_enter(&state->arcs_mtx);
2097                         ab_prev = list_prev(list, &marker);
2098                         list_remove(list, &marker);
2099                         count = 0;
2100                         continue;
2101                 }
2102                 if (mutex_tryenter(hash_lock)) {
2103                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
2104                         ASSERT(ab->b_buf == NULL);
2105                         ARCSTAT_BUMP(arcstat_deleted);
2106                         bytes_deleted += ab->b_size;
2107
2108                         if (ab->b_l2hdr != NULL) {
2109                                 /*
2110                                  * This buffer is cached on the 2nd Level ARC;
2111                                  * don't destroy the header.
2112                                  */
2113                                 arc_change_state(arc_l2c_only, ab, hash_lock);
2114                                 mutex_exit(hash_lock);
2115                         } else {
2116                                 arc_change_state(arc_anon, ab, hash_lock);
2117                                 mutex_exit(hash_lock);
2118                                 arc_hdr_destroy(ab);
2119                         }
2120
2121                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2122                         if (bytes >= 0 && bytes_deleted >= bytes)
2123                                 break;
2124                 } else if (bytes < 0) {
2125                         /*
2126                          * Insert a list marker and then wait for the
2127                          * hash lock to become available. Once its
2128                          * available, restart from where we left off.
2129                          */
2130                         list_insert_after(list, ab, &marker);
2131                         mutex_exit(&state->arcs_mtx);
2132                         mutex_enter(hash_lock);
2133                         mutex_exit(hash_lock);
2134                         mutex_enter(&state->arcs_mtx);
2135                         ab_prev = list_prev(list, &marker);
2136                         list_remove(list, &marker);
2137                 } else {
2138                         bufs_skipped += 1;
2139                 }
2140         }
2141         mutex_exit(&state->arcs_mtx);
2142
2143         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2144             (bytes < 0 || bytes_deleted < bytes)) {
2145                 list = &state->arcs_list[ARC_BUFC_METADATA];
2146                 goto top;
2147         }
2148
2149         if (bufs_skipped) {
2150                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2151                 ASSERT(bytes >= 0);
2152         }
2153
2154         if (bytes_deleted < bytes)
2155                 dprintf("only deleted %lld bytes from %p\n",
2156                     (longlong_t)bytes_deleted, state);
2157 }
2158
2159 static void
2160 arc_adjust(void)
2161 {
2162         int64_t adjustment, delta;
2163
2164         /*
2165          * Adjust MRU size
2166          */
2167
2168         adjustment = MIN((int64_t)(arc_size - arc_c),
2169             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
2170
2171         if (adjustment > 0 && arc_mru->arcs_size > 0) {
2172                 delta = MIN(arc_mru->arcs_size, adjustment);
2173                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2174         }
2175
2176         /*
2177          * Adjust MFU size
2178          */
2179
2180         adjustment = arc_size - arc_c;
2181
2182         if (adjustment > 0 && arc_mfu->arcs_size > 0) {
2183                 delta = MIN(arc_mfu->arcs_size, adjustment);
2184                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2185         }
2186
2187         /*
2188          * Adjust ghost lists
2189          */
2190
2191         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2192
2193         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2194                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2195                 arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA);
2196         }
2197
2198         adjustment =
2199             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2200
2201         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2202                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2203                 arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA);
2204         }
2205 }
2206
2207 /*
2208  * Request that arc user drop references so that N bytes can be released
2209  * from the cache.  This provides a mechanism to ensure the arc can honor
2210  * the arc_meta_limit and reclaim buffers which are pinned in the cache
2211  * by higher layers.  (i.e. the zpl)
2212  */
2213 static void
2214 arc_do_user_prune(int64_t adjustment)
2215 {
2216         arc_prune_func_t *func;
2217         void *private;
2218         arc_prune_t *cp, *np;
2219
2220         mutex_enter(&arc_prune_mtx);
2221
2222         cp = list_head(&arc_prune_list);
2223         while (cp != NULL) {
2224                 func = cp->p_pfunc;
2225                 private = cp->p_private;
2226                 np = list_next(&arc_prune_list, cp);
2227                 refcount_add(&cp->p_refcnt, func);
2228                 mutex_exit(&arc_prune_mtx);
2229
2230                 if (func != NULL)
2231                         func(adjustment, private);
2232
2233                 mutex_enter(&arc_prune_mtx);
2234
2235                 /* User removed prune callback concurrently with execution */
2236                 if (refcount_remove(&cp->p_refcnt, func) == 0) {
2237                         ASSERT(!list_link_active(&cp->p_node));
2238                         refcount_destroy(&cp->p_refcnt);
2239                         kmem_free(cp, sizeof (*cp));
2240                 }
2241
2242                 cp = np;
2243         }
2244
2245         ARCSTAT_BUMP(arcstat_prune);
2246         mutex_exit(&arc_prune_mtx);
2247 }
2248
2249 static void
2250 arc_do_user_evicts(void)
2251 {
2252         mutex_enter(&arc_eviction_mtx);
2253         while (arc_eviction_list != NULL) {
2254                 arc_buf_t *buf = arc_eviction_list;
2255                 arc_eviction_list = buf->b_next;
2256                 mutex_enter(&buf->b_evict_lock);
2257                 buf->b_hdr = NULL;
2258                 mutex_exit(&buf->b_evict_lock);
2259                 mutex_exit(&arc_eviction_mtx);
2260
2261                 if (buf->b_efunc != NULL)
2262                         VERIFY(buf->b_efunc(buf) == 0);
2263
2264                 buf->b_efunc = NULL;
2265                 buf->b_private = NULL;
2266                 kmem_cache_free(buf_cache, buf);
2267                 mutex_enter(&arc_eviction_mtx);
2268         }
2269         mutex_exit(&arc_eviction_mtx);
2270 }
2271
2272 /*
2273  * Evict only meta data objects from the cache leaving the data objects.
2274  * This is only used to enforce the tunable arc_meta_limit, if we are
2275  * unable to evict enough buffers notify the user via the prune callback.
2276  */
2277 static void
2278 arc_adjust_meta(void)
2279 {
2280         int64_t adjustmnt, delta;
2281
2282         /*
2283          * This slightly differs than the way we evict from the mru in
2284          * arc_adjust because we don't have a "target" value (i.e. no
2285          * "meta" arc_p). As a result, I think we can completely
2286          * cannibalize the metadata in the MRU before we evict the
2287          * metadata from the MFU. I think we probably need to implement a
2288          * "metadata arc_p" value to do this properly.
2289          */
2290         adjustmnt = arc_meta_used - arc_meta_limit;
2291
2292         if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2293                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
2294                 arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
2295                 adjustmnt -= delta;
2296         }
2297
2298         /*
2299          * We can't afford to recalculate adjustmnt here. If we do,
2300          * new metadata buffers can sneak into the MRU or ANON lists,
2301          * thus penalize the MFU metadata. Although the fudge factor is
2302          * small, it has been empirically shown to be significant for
2303          * certain workloads (e.g. creating many empty directories). As
2304          * such, we use the original calculation for adjustmnt, and
2305          * simply decrement the amount of data evicted from the MRU.
2306          */
2307
2308         if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2309                 delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
2310                 arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
2311         }
2312
2313         adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2314             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
2315
2316         if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2317                 delta = MIN(adjustmnt,
2318                     arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
2319                 arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
2320         }
2321
2322         adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2323             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
2324
2325         if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2326                 delta = MIN(adjustmnt,
2327                     arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
2328                 arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
2329         }
2330
2331         if (arc_meta_used > arc_meta_limit)
2332                 arc_do_user_prune(zfs_arc_meta_prune);
2333 }
2334
2335 /*
2336  * Flush all *evictable* data from the cache for the given spa.
2337  * NOTE: this will not touch "active" (i.e. referenced) data.
2338  */
2339 void
2340 arc_flush(spa_t *spa)
2341 {
2342         uint64_t guid = 0;
2343
2344         if (spa)
2345                 guid = spa_load_guid(spa);
2346
2347         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2348                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2349                 if (spa)
2350                         break;
2351         }
2352         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2353                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2354                 if (spa)
2355                         break;
2356         }
2357         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2358                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2359                 if (spa)
2360                         break;
2361         }
2362         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2363                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2364                 if (spa)
2365                         break;
2366         }
2367
2368         arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA);
2369         arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA);
2370
2371         mutex_enter(&arc_reclaim_thr_lock);
2372         arc_do_user_evicts();
2373         mutex_exit(&arc_reclaim_thr_lock);
2374         ASSERT(spa || arc_eviction_list == NULL);
2375 }
2376
2377 void
2378 arc_shrink(uint64_t bytes)
2379 {
2380         if (arc_c > arc_c_min) {
2381                 uint64_t to_free;
2382
2383                 to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
2384
2385                 if (arc_c > arc_c_min + to_free)
2386                         atomic_add_64(&arc_c, -to_free);
2387                 else
2388                         arc_c = arc_c_min;
2389
2390                 to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
2391
2392                 if (arc_p > to_free)
2393                         atomic_add_64(&arc_p, -to_free);
2394                 else
2395                         arc_p = 0;
2396
2397                 if (arc_c > arc_size)
2398                         arc_c = MAX(arc_size, arc_c_min);
2399                 if (arc_p > arc_c)
2400                         arc_p = (arc_c >> 1);
2401                 ASSERT(arc_c >= arc_c_min);
2402                 ASSERT((int64_t)arc_p >= 0);
2403         }
2404
2405         if (arc_size > arc_c)
2406                 arc_adjust();
2407 }
2408
2409 static void
2410 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
2411 {
2412         size_t                  i;
2413         kmem_cache_t            *prev_cache = NULL;
2414         kmem_cache_t            *prev_data_cache = NULL;
2415         extern kmem_cache_t     *zio_buf_cache[];
2416         extern kmem_cache_t     *zio_data_buf_cache[];
2417
2418         /*
2419          * An aggressive reclamation will shrink the cache size as well as
2420          * reap free buffers from the arc kmem caches.
2421          */
2422         if (strat == ARC_RECLAIM_AGGR)
2423                 arc_shrink(bytes);
2424
2425         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2426                 if (zio_buf_cache[i] != prev_cache) {
2427                         prev_cache = zio_buf_cache[i];
2428                         kmem_cache_reap_now(zio_buf_cache[i]);
2429                 }
2430                 if (zio_data_buf_cache[i] != prev_data_cache) {
2431                         prev_data_cache = zio_data_buf_cache[i];
2432                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2433                 }
2434         }
2435
2436         kmem_cache_reap_now(buf_cache);
2437         kmem_cache_reap_now(hdr_cache);
2438 }
2439
2440 /*
2441  * Unlike other ZFS implementations this thread is only responsible for
2442  * adapting the target ARC size on Linux.  The responsibility for memory
2443  * reclamation has been entirely delegated to the arc_shrinker_func()
2444  * which is registered with the VM.  To reflect this change in behavior
2445  * the arc_reclaim thread has been renamed to arc_adapt.
2446  */
2447 static void
2448 arc_adapt_thread(void)
2449 {
2450         callb_cpr_t             cpr;
2451
2452         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2453
2454         mutex_enter(&arc_reclaim_thr_lock);
2455         while (arc_thread_exit == 0) {
2456 #ifndef _KERNEL
2457                 arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2458
2459                 if (spa_get_random(100) == 0) {
2460
2461                         if (arc_no_grow) {
2462                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2463                                         last_reclaim = ARC_RECLAIM_AGGR;
2464                                 } else {
2465                                         last_reclaim = ARC_RECLAIM_CONS;
2466                                 }
2467                         } else {
2468                                 arc_no_grow = TRUE;
2469                                 last_reclaim = ARC_RECLAIM_AGGR;
2470                                 membar_producer();
2471                         }
2472
2473                         /* reset the growth delay for every reclaim */
2474                         arc_grow_time = ddi_get_lbolt() +
2475                             (zfs_arc_grow_retry * hz);
2476
2477                         arc_kmem_reap_now(last_reclaim, 0);
2478                         arc_warm = B_TRUE;
2479                 }
2480 #endif /* !_KERNEL */
2481
2482                 /* No recent memory pressure allow the ARC to grow. */
2483                 if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
2484                         arc_no_grow = FALSE;
2485
2486                 arc_adjust_meta();
2487
2488                 arc_adjust();
2489
2490                 if (arc_eviction_list != NULL)
2491                         arc_do_user_evicts();
2492
2493                 /* block until needed, or one second, whichever is shorter */
2494                 CALLB_CPR_SAFE_BEGIN(&cpr);
2495                 (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
2496                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2497                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2498
2499
2500                 /* Allow the module options to be changed */
2501                 if (zfs_arc_max > 64 << 20 &&
2502                     zfs_arc_max < physmem * PAGESIZE &&
2503                     zfs_arc_max != arc_c_max)
2504                         arc_c_max = zfs_arc_max;
2505
2506                 if (zfs_arc_min > 0 &&
2507                     zfs_arc_min < arc_c_max &&
2508                     zfs_arc_min != arc_c_min)
2509                         arc_c_min = zfs_arc_min;
2510
2511                 if (zfs_arc_meta_limit > 0 &&
2512                     zfs_arc_meta_limit <= arc_c_max &&
2513                     zfs_arc_meta_limit != arc_meta_limit)
2514                         arc_meta_limit = zfs_arc_meta_limit;
2515
2516
2517
2518         }
2519
2520         arc_thread_exit = 0;
2521         cv_broadcast(&arc_reclaim_thr_cv);
2522         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2523         thread_exit();
2524 }
2525
2526 #ifdef _KERNEL
2527 /*
2528  * Determine the amount of memory eligible for eviction contained in the
2529  * ARC. All clean data reported by the ghost lists can always be safely
2530  * evicted. Due to arc_c_min, the same does not hold for all clean data
2531  * contained by the regular mru and mfu lists.
2532  *
2533  * In the case of the regular mru and mfu lists, we need to report as
2534  * much clean data as possible, such that evicting that same reported
2535  * data will not bring arc_size below arc_c_min. Thus, in certain
2536  * circumstances, the total amount of clean data in the mru and mfu
2537  * lists might not actually be evictable.
2538  *
2539  * The following two distinct cases are accounted for:
2540  *
2541  * 1. The sum of the amount of dirty data contained by both the mru and
2542  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2543  *    is greater than or equal to arc_c_min.
2544  *    (i.e. amount of dirty data >= arc_c_min)
2545  *
2546  *    This is the easy case; all clean data contained by the mru and mfu
2547  *    lists is evictable. Evicting all clean data can only drop arc_size
2548  *    to the amount of dirty data, which is greater than arc_c_min.
2549  *
2550  * 2. The sum of the amount of dirty data contained by both the mru and
2551  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
2552  *    is less than arc_c_min.
2553  *    (i.e. arc_c_min > amount of dirty data)
2554  *
2555  *    2.1. arc_size is greater than or equal to arc_c_min.
2556  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
2557  *
2558  *         In this case, not all clean data from the regular mru and mfu
2559  *         lists is actually evictable; we must leave enough clean data
2560  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
2561  *         evictable data from the two lists combined, is exactly the
2562  *         difference between arc_size and arc_c_min.
2563  *
2564  *    2.2. arc_size is less than arc_c_min
2565  *         (i.e. arc_c_min > arc_size > amount of dirty data)
2566  *
2567  *         In this case, none of the data contained in the mru and mfu
2568  *         lists is evictable, even if it's clean. Since arc_size is
2569  *         already below arc_c_min, evicting any more would only
2570  *         increase this negative difference.
2571  */
2572 static uint64_t
2573 arc_evictable_memory(void) {
2574         uint64_t arc_clean =
2575             arc_mru->arcs_lsize[ARC_BUFC_DATA] +
2576             arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
2577             arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
2578             arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
2579         uint64_t ghost_clean =
2580             arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
2581             arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
2582             arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
2583             arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
2584         uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
2585
2586         if (arc_dirty >= arc_c_min)
2587                 return (ghost_clean + arc_clean);
2588
2589         return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
2590 }
2591
2592 static int
2593 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
2594 {
2595         uint64_t pages;
2596
2597         /* The arc is considered warm once reclaim has occurred */
2598         if (unlikely(arc_warm == B_FALSE))
2599                 arc_warm = B_TRUE;
2600
2601         /* Return the potential number of reclaimable pages */
2602         pages = btop(arc_evictable_memory());
2603         if (sc->nr_to_scan == 0)
2604                 return (pages);
2605
2606         /* Not allowed to perform filesystem reclaim */
2607         if (!(sc->gfp_mask & __GFP_FS))
2608                 return (-1);
2609
2610         /* Reclaim in progress */
2611         if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
2612                 return (-1);
2613
2614         /*
2615          * Evict the requested number of pages by shrinking arc_c the
2616          * requested amount.  If there is nothing left to evict just
2617          * reap whatever we can from the various arc slabs.
2618          */
2619         if (pages > 0) {
2620                 arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
2621                 pages = btop(arc_evictable_memory());
2622         } else {
2623                 arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
2624                 pages = -1;
2625         }
2626
2627         /*
2628          * When direct reclaim is observed it usually indicates a rapid
2629          * increase in memory pressure.  This occurs because the kswapd
2630          * threads were unable to asynchronously keep enough free memory
2631          * available.  In this case set arc_no_grow to briefly pause arc
2632          * growth to avoid compounding the memory pressure.
2633          */
2634         if (current_is_kswapd()) {
2635                 ARCSTAT_BUMP(arcstat_memory_indirect_count);
2636         } else {
2637                 arc_no_grow = B_TRUE;
2638                 arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
2639                 ARCSTAT_BUMP(arcstat_memory_direct_count);
2640         }
2641
2642         mutex_exit(&arc_reclaim_thr_lock);
2643
2644         return (pages);
2645 }
2646 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
2647
2648 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
2649 #endif /* _KERNEL */
2650
2651 /*
2652  * Adapt arc info given the number of bytes we are trying to add and
2653  * the state that we are comming from.  This function is only called
2654  * when we are adding new content to the cache.
2655  */
2656 static void
2657 arc_adapt(int bytes, arc_state_t *state)
2658 {
2659         int mult;
2660
2661         if (state == arc_l2c_only)
2662                 return;
2663
2664         ASSERT(bytes > 0);
2665         /*
2666          * Adapt the target size of the MRU list:
2667          *      - if we just hit in the MRU ghost list, then increase
2668          *        the target size of the MRU list.
2669          *      - if we just hit in the MFU ghost list, then increase
2670          *        the target size of the MFU list by decreasing the
2671          *        target size of the MRU list.
2672          */
2673         if (state == arc_mru_ghost) {
2674                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2675                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2676
2677                 if (!zfs_arc_p_dampener_disable)
2678                         mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2679
2680                 arc_p = MIN(arc_c, arc_p + bytes * mult);
2681         } else if (state == arc_mfu_ghost) {
2682                 uint64_t delta;
2683
2684                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2685                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2686
2687                 if (!zfs_arc_p_dampener_disable)
2688                         mult = MIN(mult, 10);
2689
2690                 delta = MIN(bytes * mult, arc_p);
2691                 arc_p = MAX(0, arc_p - delta);
2692         }
2693         ASSERT((int64_t)arc_p >= 0);
2694
2695         if (arc_no_grow)
2696                 return;
2697
2698         if (arc_c >= arc_c_max)
2699                 return;
2700
2701         /*
2702          * If we're within (2 * maxblocksize) bytes of the target
2703          * cache size, increment the target cache size
2704          */
2705         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2706                 atomic_add_64(&arc_c, (int64_t)bytes);
2707                 if (arc_c > arc_c_max)
2708                         arc_c = arc_c_max;
2709                 else if (state == arc_anon)
2710                         atomic_add_64(&arc_p, (int64_t)bytes);
2711                 if (arc_p > arc_c)
2712                         arc_p = arc_c;
2713         }
2714         ASSERT((int64_t)arc_p >= 0);
2715 }
2716
2717 /*
2718  * Check if the cache has reached its limits and eviction is required
2719  * prior to insert.
2720  */
2721 static int
2722 arc_evict_needed(arc_buf_contents_t type)
2723 {
2724         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2725                 return (1);
2726
2727         if (arc_no_grow)
2728                 return (1);
2729
2730         return (arc_size > arc_c);
2731 }
2732
2733 /*
2734  * The buffer, supplied as the first argument, needs a data block.
2735  * So, if we are at cache max, determine which cache should be victimized.
2736  * We have the following cases:
2737  *
2738  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2739  * In this situation if we're out of space, but the resident size of the MFU is
2740  * under the limit, victimize the MFU cache to satisfy this insertion request.
2741  *
2742  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2743  * Here, we've used up all of the available space for the MRU, so we need to
2744  * evict from our own cache instead.  Evict from the set of resident MRU
2745  * entries.
2746  *
2747  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2748  * c minus p represents the MFU space in the cache, since p is the size of the
2749  * cache that is dedicated to the MRU.  In this situation there's still space on
2750  * the MFU side, so the MRU side needs to be victimized.
2751  *
2752  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2753  * MFU's resident set is consuming more space than it has been allotted.  In
2754  * this situation, we must victimize our own cache, the MFU, for this insertion.
2755  */
2756 static void
2757 arc_get_data_buf(arc_buf_t *buf)
2758 {
2759         arc_state_t             *state = buf->b_hdr->b_state;
2760         uint64_t                size = buf->b_hdr->b_size;
2761         arc_buf_contents_t      type = buf->b_hdr->b_type;
2762         arc_buf_contents_t      evict = ARC_BUFC_DATA;
2763         boolean_t               recycle = TRUE;
2764
2765         arc_adapt(size, state);
2766
2767         /*
2768          * We have not yet reached cache maximum size,
2769          * just allocate a new buffer.
2770          */
2771         if (!arc_evict_needed(type)) {
2772                 if (type == ARC_BUFC_METADATA) {
2773                         buf->b_data = zio_buf_alloc(size);
2774                         arc_space_consume(size, ARC_SPACE_META);
2775                 } else {
2776                         ASSERT(type == ARC_BUFC_DATA);
2777                         buf->b_data = zio_data_buf_alloc(size);
2778                         arc_space_consume(size, ARC_SPACE_DATA);
2779                 }
2780                 goto out;
2781         }
2782
2783         /*
2784          * If we are prefetching from the mfu ghost list, this buffer
2785          * will end up on the mru list; so steal space from there.
2786          */
2787         if (state == arc_mfu_ghost)
2788                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2789         else if (state == arc_mru_ghost)
2790                 state = arc_mru;
2791
2792         if (state == arc_mru || state == arc_anon) {
2793                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2794                 state = (arc_mfu->arcs_lsize[type] >= size &&
2795                     arc_p > mru_used) ? arc_mfu : arc_mru;
2796         } else {
2797                 /* MFU cases */
2798                 uint64_t mfu_space = arc_c - arc_p;
2799                 state =  (arc_mru->arcs_lsize[type] >= size &&
2800                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2801         }
2802
2803         /*
2804          * Evict data buffers prior to metadata buffers, unless we're
2805          * over the metadata limit and adding a metadata buffer.
2806          */
2807         if (type == ARC_BUFC_METADATA) {
2808                 if (arc_meta_used >= arc_meta_limit)
2809                         evict = ARC_BUFC_METADATA;
2810                 else
2811                         /*
2812                          * In this case, we're evicting data while
2813                          * adding metadata. Thus, to prevent recycling a
2814                          * data buffer into a metadata buffer, recycling
2815                          * is disabled in the following arc_evict call.
2816                          */
2817                         recycle = FALSE;
2818         }
2819
2820         if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
2821                 if (type == ARC_BUFC_METADATA) {
2822                         buf->b_data = zio_buf_alloc(size);
2823                         arc_space_consume(size, ARC_SPACE_META);
2824
2825                         /*
2826                          * If we are unable to recycle an existing meta buffer
2827                          * signal the reclaim thread.  It will notify users
2828                          * via the prune callback to drop references.  The
2829                          * prune callback in run in the context of the reclaim
2830                          * thread to avoid deadlocking on the hash_lock.
2831                          * Of course, only do this when recycle is true.
2832                          */
2833                         if (recycle)
2834                                 cv_signal(&arc_reclaim_thr_cv);
2835                 } else {
2836                         ASSERT(type == ARC_BUFC_DATA);
2837                         buf->b_data = zio_data_buf_alloc(size);
2838                         arc_space_consume(size, ARC_SPACE_DATA);
2839                 }
2840
2841                 /* Only bump this if we tried to recycle and failed */
2842                 if (recycle)
2843                         ARCSTAT_BUMP(arcstat_recycle_miss);
2844         }
2845         ASSERT(buf->b_data != NULL);
2846 out:
2847         /*
2848          * Update the state size.  Note that ghost states have a
2849          * "ghost size" and so don't need to be updated.
2850          */
2851         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2852                 arc_buf_hdr_t *hdr = buf->b_hdr;
2853
2854                 atomic_add_64(&hdr->b_state->arcs_size, size);
2855                 if (list_link_active(&hdr->b_arc_node)) {
2856                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2857                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2858                 }
2859                 /*
2860                  * If we are growing the cache, and we are adding anonymous
2861                  * data, and we have outgrown arc_p, update arc_p
2862                  */
2863                 if (!zfs_arc_p_aggressive_disable &&
2864                     arc_size < arc_c && hdr->b_state == arc_anon &&
2865                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2866                         arc_p = MIN(arc_c, arc_p + size);
2867         }
2868 }
2869
2870 /*
2871  * This routine is called whenever a buffer is accessed.
2872  * NOTE: the hash lock is dropped in this function.
2873  */
2874 static void
2875 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2876 {
2877         clock_t now;
2878
2879         ASSERT(MUTEX_HELD(hash_lock));
2880
2881         if (buf->b_state == arc_anon) {
2882                 /*
2883                  * This buffer is not in the cache, and does not
2884                  * appear in our "ghost" list.  Add the new buffer
2885                  * to the MRU state.
2886                  */
2887
2888                 ASSERT(buf->b_arc_access == 0);
2889                 buf->b_arc_access = ddi_get_lbolt();
2890                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2891                 arc_change_state(arc_mru, buf, hash_lock);
2892
2893         } else if (buf->b_state == arc_mru) {
2894                 now = ddi_get_lbolt();
2895
2896                 /*
2897                  * If this buffer is here because of a prefetch, then either:
2898                  * - clear the flag if this is a "referencing" read
2899                  *   (any subsequent access will bump this into the MFU state).
2900                  * or
2901                  * - move the buffer to the head of the list if this is
2902                  *   another prefetch (to make it less likely to be evicted).
2903                  */
2904                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2905                         if (refcount_count(&buf->b_refcnt) == 0) {
2906                                 ASSERT(list_link_active(&buf->b_arc_node));
2907                         } else {
2908                                 buf->b_flags &= ~ARC_PREFETCH;
2909                                 atomic_inc_32(&buf->b_mru_hits);
2910                                 ARCSTAT_BUMP(arcstat_mru_hits);
2911                         }
2912                         buf->b_arc_access = now;
2913                         return;
2914                 }
2915
2916                 /*
2917                  * This buffer has been "accessed" only once so far,
2918                  * but it is still in the cache. Move it to the MFU
2919                  * state.
2920                  */
2921                 if (now > buf->b_arc_access + ARC_MINTIME) {
2922                         /*
2923                          * More than 125ms have passed since we
2924                          * instantiated this buffer.  Move it to the
2925                          * most frequently used state.
2926                          */
2927                         buf->b_arc_access = now;
2928                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2929                         arc_change_state(arc_mfu, buf, hash_lock);
2930                 }
2931                 atomic_inc_32(&buf->b_mru_hits);
2932                 ARCSTAT_BUMP(arcstat_mru_hits);
2933         } else if (buf->b_state == arc_mru_ghost) {
2934                 arc_state_t     *new_state;
2935                 /*
2936                  * This buffer has been "accessed" recently, but
2937                  * was evicted from the cache.  Move it to the
2938                  * MFU state.
2939                  */
2940
2941                 if (buf->b_flags & ARC_PREFETCH) {
2942                         new_state = arc_mru;
2943                         if (refcount_count(&buf->b_refcnt) > 0)
2944                                 buf->b_flags &= ~ARC_PREFETCH;
2945                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2946                 } else {
2947                         new_state = arc_mfu;
2948                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2949                 }
2950
2951                 buf->b_arc_access = ddi_get_lbolt();
2952                 arc_change_state(new_state, buf, hash_lock);
2953
2954                 atomic_inc_32(&buf->b_mru_ghost_hits);
2955                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2956         } else if (buf->b_state == arc_mfu) {
2957                 /*
2958                  * This buffer has been accessed more than once and is
2959                  * still in the cache.  Keep it in the MFU state.
2960                  *
2961                  * NOTE: an add_reference() that occurred when we did
2962                  * the arc_read() will have kicked this off the list.
2963                  * If it was a prefetch, we will explicitly move it to
2964                  * the head of the list now.
2965                  */
2966                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2967                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2968                         ASSERT(list_link_active(&buf->b_arc_node));
2969                 }
2970                 atomic_inc_32(&buf->b_mfu_hits);
2971                 ARCSTAT_BUMP(arcstat_mfu_hits);
2972                 buf->b_arc_access = ddi_get_lbolt();
2973         } else if (buf->b_state == arc_mfu_ghost) {
2974                 arc_state_t     *new_state = arc_mfu;
2975                 /*
2976                  * This buffer has been accessed more than once but has
2977                  * been evicted from the cache.  Move it back to the
2978                  * MFU state.
2979                  */
2980
2981                 if (buf->b_flags & ARC_PREFETCH) {
2982                         /*
2983                          * This is a prefetch access...
2984                          * move this block back to the MRU state.
2985                          */
2986                         ASSERT0(refcount_count(&buf->b_refcnt));
2987                         new_state = arc_mru;
2988                 }
2989
2990                 buf->b_arc_access = ddi_get_lbolt();
2991                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2992                 arc_change_state(new_state, buf, hash_lock);
2993
2994                 atomic_inc_32(&buf->b_mfu_ghost_hits);
2995                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2996         } else if (buf->b_state == arc_l2c_only) {
2997                 /*
2998                  * This buffer is on the 2nd Level ARC.
2999                  */
3000
3001                 buf->b_arc_access = ddi_get_lbolt();
3002                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3003                 arc_change_state(arc_mfu, buf, hash_lock);
3004         } else {
3005                 ASSERT(!"invalid arc state");
3006         }
3007 }
3008
3009 /* a generic arc_done_func_t which you can use */
3010 /* ARGSUSED */
3011 void
3012 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3013 {
3014         if (zio == NULL || zio->io_error == 0)
3015                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3016         VERIFY(arc_buf_remove_ref(buf, arg));
3017 }
3018
3019 /* a generic arc_done_func_t */
3020 void
3021 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3022 {
3023         arc_buf_t **bufp = arg;
3024         if (zio && zio->io_error) {
3025                 VERIFY(arc_buf_remove_ref(buf, arg));
3026                 *bufp = NULL;
3027         } else {
3028                 *bufp = buf;
3029                 ASSERT(buf->b_data);
3030         }
3031 }
3032
/*
 * Completion callback for the zio_read() that arc_read() issues on a cache
 * miss.  zio->io_private is the arc_buf_t that arc_read() passed as the
 * zio's private argument.
 *
 * In order, this function: re-finds the hdr in the hash table, byteswaps
 * the data if the block pointer requires it, computes the buffer checksum,
 * assigns a buffer to every queued callback that has a done function
 * (the original buf to the first one, clones to the rest), clears
 * ARC_IO_IN_PROGRESS, handles I/O errors, wakes any threads sleeping on
 * hdr->b_cv, drops the hash lock, runs the callbacks, and finally destroys
 * the hdr if it ended up unreferenced after an error or a free-during-read.
 */
3033 static void
3034 arc_read_done(zio_t *zio)
3035 {
3036         arc_buf_hdr_t   *hdr, *found;
3037         arc_buf_t       *buf;
3038         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
3039         kmutex_t        *hash_lock;
3040         arc_callback_t  *callback_list, *acb;
3041         int             freeable = FALSE;
3042
3043         buf = zio->io_private;
3044         hdr = buf->b_hdr;
3045
3046         /*
3047          * The hdr was inserted into hash-table and removed from lists
3048          * prior to starting I/O.  We should find this header, since
3049          * it's in the hash table, and it should be legit since it's
3050          * not possible to evict it during the I/O.  The only possible
3051          * reason for it not to be found is if we were freed during the
3052          * read.
3053          */
3054         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
3055             &hash_lock);
3056
3057         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
3058             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3059             (found == hdr && HDR_L2_READING(hdr)));
3060
3061         hdr->b_flags &= ~ARC_L2_EVICTED;
3062         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3063                 hdr->b_flags &= ~ARC_L2CACHE;
3064
3065         /* byteswap if necessary */
3066         callback_list = hdr->b_acb;
3067         ASSERT(callback_list != NULL);
3068         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3069                 dmu_object_byteswap_t bswap =
3070                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
                     /*
                      * Blocks above level 0 are swapped as a plain array of
                      * uint64_t; level-0 blocks use the byteswap routine
                      * registered for their object type.
                      */
3071                 if (BP_GET_LEVEL(zio->io_bp) > 0)
3072                     byteswap_uint64_array(buf->b_data, hdr->b_size);
3073                 else
3074                     dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
3075         }
3076
3077         arc_cksum_compute(buf, B_FALSE);
3078         arc_buf_watch(buf);
3079
3080         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3081                 /*
3082                  * Only call arc_access on anonymous buffers.  This is because
3083                  * if we've issued an I/O for an evicted buffer, we've already
3084                  * called arc_access (to prevent any simultaneous readers from
3085                  * getting confused).
3086                  */
3087                 arc_access(hdr, hash_lock);
3088         }
3089
3090         /* create copies of the data buffer for the callers */
             /*
              * The first callback with a done function receives the original
              * buf; each later one receives a fresh clone.  abuf still equals
              * buf after the loop only if no callback claimed it.
              */
3091         abuf = buf;
3092         for (acb = callback_list; acb; acb = acb->acb_next) {
3093                 if (acb->acb_done) {
3094                         if (abuf == NULL) {
3095                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
3096                                 abuf = arc_buf_clone(buf);
3097                         }
3098                         acb->acb_buf = abuf;
3099                         abuf = NULL;
3100                 }
3101         }
3102         hdr->b_acb = NULL;
3103         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3104         ASSERT(!HDR_BUF_AVAILABLE(hdr));
             /*
              * Nobody took the original buffer, so flag it as available;
              * arc_read() hands an available buffer to the next reader
              * instead of cloning it.
              */
3105         if (abuf == buf) {
3106                 ASSERT(buf->b_efunc == NULL);
3107                 ASSERT(hdr->b_datacnt == 1);
3108                 hdr->b_flags |= ARC_BUF_AVAILABLE;
3109         }
3110
3111         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3112
             /*
              * Failed read: mark the hdr, make it anonymous and pull it out
              * of the hash table.  It is destroyed at the end of this
              * function if nothing holds a reference to it.
              */
3113         if (zio->io_error != 0) {
3114                 hdr->b_flags |= ARC_IO_ERROR;
3115                 if (hdr->b_state != arc_anon)
3116                         arc_change_state(arc_anon, hdr, hash_lock);
3117                 if (HDR_IN_HASH_TABLE(hdr))
3118                         buf_hash_remove(hdr);
3119                 freeable = refcount_is_zero(&hdr->b_refcnt);
3120         }
3121
3122         /*
3123          * Broadcast before we drop the hash_lock to avoid the possibility
3124          * that the hdr (and hence the cv) might be freed before we get to
3125          * the cv_broadcast().
3126          */
3127         cv_broadcast(&hdr->b_cv);
3128
3129         if (hash_lock) {
3130                 mutex_exit(hash_lock);
3131         } else {
3132                 /*
3133                  * This block was freed while we waited for the read to
3134                  * complete.  It has been removed from the hash table and
3135                  * moved to the anonymous state (so that it won't show up
3136                  * in the cache).
3137                  */
3138                 ASSERT3P(hdr->b_state, ==, arc_anon);
3139                 freeable = refcount_is_zero(&hdr->b_refcnt);
3140         }
3141
3142         /* execute each callback and free its structure */
             /* Note that the hash_lock is no longer held while these run. */
3143         while ((acb = callback_list) != NULL) {
3144                 if (acb->acb_done)
3145                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3146
                     /*
                      * arc_read() creates acb_zio_dummy (a null zio under the
                      * caller's parent zio) when it queues a callback onto an
                      * in-flight read; copy this read's error into it and
                      * issue it.
                      */
3147                 if (acb->acb_zio_dummy != NULL) {
3148                         acb->acb_zio_dummy->io_error = zio->io_error;
3149                         zio_nowait(acb->acb_zio_dummy);
3150                 }
3151
3152                 callback_list = acb->acb_next;
3153                 kmem_free(acb, sizeof (arc_callback_t));
3154         }
3155
3156         if (freeable)
3157                 arc_hdr_destroy(hdr);
3158 }
3159
3160 /*
3161  * "Read" the block at the specified DVA (in bp) via the
3162  * cache.  If the block is found in the cache, invoke the provided
3163  * callback immediately and return.  Note that the `zio' parameter
3164  * in the callback will be NULL in this case, since no IO was
3165  * required.  If the block is not in the cache pass the read request
3166  * on to the spa with a substitute callback function, so that the
3167  * requested block will be added to the cache.
3168  *
3169  * If a read request arrives for a block that has a read in-progress,
3170  * either wait for the in-progress read to complete (and return the
3171  * results); or, if this is a read with a "done" func, add a record
3172  * to the read to invoke the "done" func when the read completes,
3173  * and return; or just return.
3174  *
3175  * arc_read_done() will invoke all the requested "done" functions
3176  * for readers of this block.
      *
      * Arguments, as used by the code below:
      *   pio       - parent zio handed to zio_read(), zio_read_phys() and
      *               zio_null(); tested against NULL only on the
      *               in-progress path.
      *   spa, bp   - pool and block pointer identifying the block.
      *   done      - completion callback; may be NULL.
      *   private   - passed to done() and used as the tag for references
      *               taken on the header.
      *   priority, zio_flags - forwarded to the zio that is issued.
      *   arc_flags - in/out.  ARC_WAIT or ARC_NOWAIT selects synchronous or
      *               asynchronous behaviour; ARC_PREFETCH, ARC_L2CACHE and
      *               ARC_L2COMPRESS are copied onto the header; ARC_CACHED
      *               is set here when the block was found with data.
      *   zb        - bookmark forwarded to zio_read(), copied into the
      *               L2ARC read callback, and passed to
      *               spa_read_history_add().
      *
      * Returns the result of zio_wait() when a synchronous zio_read() was
      * issued; every other path returns 0.
3177  */
3178 int
3179 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3180     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3181     const zbookmark_t *zb)
3182 {
3183         arc_buf_hdr_t *hdr;
3184         arc_buf_t *buf = NULL;
3185         kmutex_t *hash_lock;
3186         zio_t *rzio;
3187         uint64_t guid = spa_load_guid(spa);
3188         int rc = 0;
3189
3190 top:
3191         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3192             &hash_lock);
             /* Hit: a header exists and it still has at least one data buffer. */
3193         if (hdr && hdr->b_datacnt > 0) {
3194
3195                 *arc_flags |= ARC_CACHED;
3196
3197                 if (HDR_IO_IN_PROGRESS(hdr)) {
3198
                             /*
                              * Synchronous caller: sleep until arc_read_done()
                              * broadcasts on b_cv, then redo the lookup from
                              * scratch.
                              */
3199                         if (*arc_flags & ARC_WAIT) {
3200                                 cv_wait(&hdr->b_cv, hash_lock);
3201                                 mutex_exit(hash_lock);
3202                                 goto top;
3203                         }
3204                         ASSERT(*arc_flags & ARC_NOWAIT);
3205
                             /*
                              * Asynchronous caller with a done func: queue a
                              * callback record on the in-flight read so that
                              * arc_read_done() invokes it.
                              */
3206                         if (done) {
3207                                 arc_callback_t  *acb = NULL;
3208
3209                                 acb = kmem_zalloc(sizeof (arc_callback_t),
3210                                     KM_PUSHPAGE);
3211                                 acb->acb_done = done;
3212                                 acb->acb_private = private;
3213                                 if (pio != NULL)
3214                                         acb->acb_zio_dummy = zio_null(pio,
3215                                             spa, NULL, NULL, NULL, zio_flags);
3216
3217                                 ASSERT(acb->acb_done != NULL);
3218                                 acb->acb_next = hdr->b_acb;
3219                                 hdr->b_acb = acb;
3220                                 add_reference(hdr, hash_lock, private);
3221                                 mutex_exit(hash_lock);
3222                                 goto out;
3223                         }
                             /* No done func: the read is already under way. */
3224                         mutex_exit(hash_lock);
3225                         goto out;
3226                 }
3227
3228                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3229
3230                 if (done) {
3231                         add_reference(hdr, hash_lock, private);
3232                         /*
3233                          * If this block is already in use, create a new
3234                          * copy of the data so that we will be guaranteed
3235                          * that arc_release() will always succeed.
3236                          */
3237                         buf = hdr->b_buf;
3238                         ASSERT(buf);
3239                         ASSERT(buf->b_data);
3240                         if (HDR_BUF_AVAILABLE(hdr)) {
3241                                 ASSERT(buf->b_efunc == NULL);
3242                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3243                         } else {
3244                                 buf = arc_buf_clone(buf);
3245                         }
3246
3247                 } else if (*arc_flags & ARC_PREFETCH &&
3248                     refcount_count(&hdr->b_refcnt) == 0) {
3249                         hdr->b_flags |= ARC_PREFETCH;
3250                 }
3251                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3252                 arc_access(hdr, hash_lock);
3253                 if (*arc_flags & ARC_L2CACHE)
3254                         hdr->b_flags |= ARC_L2CACHE;
3255                 if (*arc_flags & ARC_L2COMPRESS)
3256                         hdr->b_flags |= ARC_L2COMPRESS;
3257                 mutex_exit(hash_lock);
3258                 ARCSTAT_BUMP(arcstat_hits);
3259                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3260                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3261                     data, metadata, hits);
3262
                     /* No I/O was needed, so the callback gets a NULL zio. */
3263                 if (done)
3264                         done(NULL, buf, private);
3265         } else {
                     /*
                      * Miss: either there is no header at all, or there is a
                      * header without data (the ghost case handled below).
                      */
3266                 uint64_t size = BP_GET_LSIZE(bp);
3267                 arc_callback_t  *acb;
3268                 vdev_t *vd = NULL;
3269                 uint64_t addr = 0;
3270                 boolean_t devw = B_FALSE;
3271
3272                 if (hdr == NULL) {
3273                         /* this block is not in the cache */
3274                         arc_buf_hdr_t   *exists;
3275                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3276                         buf = arc_buf_alloc(spa, size, private, type);
3277                         hdr = buf->b_hdr;
3278                         hdr->b_dva = *BP_IDENTITY(bp);
3279                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3280                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3281                         exists = buf_hash_insert(hdr, &hash_lock);
3282                         if (exists) {
3283                                 /* somebody beat us to the hash insert */
3284                                 mutex_exit(hash_lock);
3285                                 buf_discard_identity(hdr);
3286                                 (void) arc_buf_remove_ref(buf, private);
3287                                 goto top; /* restart the IO request */
3288                         }
3289                         /* if this is a prefetch, we don't have a reference */
3290                         if (*arc_flags & ARC_PREFETCH) {
3291                                 (void) remove_reference(hdr, hash_lock,
3292                                     private);
3293                                 hdr->b_flags |= ARC_PREFETCH;
3294                         }
3295                         if (*arc_flags & ARC_L2CACHE)
3296                                 hdr->b_flags |= ARC_L2CACHE;
3297                         if (*arc_flags & ARC_L2COMPRESS)
3298                                 hdr->b_flags |= ARC_L2COMPRESS;
3299                         if (BP_GET_LEVEL(bp) > 0)
3300                                 hdr->b_flags |= ARC_INDIRECT;
3301                 } else {
3302                         /* this block is in the ghost cache */
3303                         ASSERT(GHOST_STATE(hdr->b_state));
3304                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3305                         ASSERT0(refcount_count(&hdr->b_refcnt));
3306                         ASSERT(hdr->b_buf == NULL);
3307
3308                         /* if this is a prefetch, we don't have a reference */
3309                         if (*arc_flags & ARC_PREFETCH)
3310                                 hdr->b_flags |= ARC_PREFETCH;
3311                         else
3312                                 add_reference(hdr, hash_lock, private);
3313                         if (*arc_flags & ARC_L2CACHE)
3314                                 hdr->b_flags |= ARC_L2CACHE;
3315                         if (*arc_flags & ARC_L2COMPRESS)
3316                                 hdr->b_flags |= ARC_L2COMPRESS;
                             /*
                              * The header has no buffer; build one by hand,
                              * attach it, and get data space for it.
                              */
3317                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3318                         buf->b_hdr = hdr;
3319                         buf->b_data = NULL;
3320                         buf->b_efunc = NULL;
3321                         buf->b_private = NULL;
3322                         buf->b_next = NULL;
3323                         hdr->b_buf = buf;
3324                         ASSERT(hdr->b_datacnt == 0);
3325                         hdr->b_datacnt = 1;
3326                         arc_get_data_buf(buf);
3327                         arc_access(hdr, hash_lock);
3328                 }
3329
3330                 ASSERT(!GHOST_STATE(hdr->b_state));
3331
                     /*
                      * Record the caller's callback (done may be NULL) as the
                      * first entry on the header and mark the read in flight.
                      */
3332                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE);
3333                 acb->acb_done = done;
3334                 acb->acb_private = private;
3335
3336                 ASSERT(hdr->b_acb == NULL);
3337                 hdr->b_acb = acb;
3338                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3339
                     /*
                      * While the hash_lock is still held, capture the L2ARC
                      * device, its "writing" flag and the buffer's device
                      * address.  vd is left NULL unless the SCL_L2ARC config
                      * lock was obtained.
                      */
3340                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3341                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3342                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3343                         addr = hdr->b_l2hdr->b_daddr;
3344                         /*
3345                          * Lock out device removal.
3346                          */
3347                         if (vdev_is_dead(vd) ||
3348                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3349                                 vd = NULL;
3350                 }
3351
3352                 mutex_exit(hash_lock);
3353
3354                 /*
3355                  * At this point, we have a level 1 cache miss.  Try again in
3356                  * L2ARC if possible.
3357                  */
3358                 ASSERT3U(hdr->b_size, ==, size);
3359                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3360                     uint64_t, size, zbookmark_t *, zb);
3361                 ARCSTAT_BUMP(arcstat_misses);
3362                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3363                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3364                     data, metadata, misses);
3365
3366                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3367                         /*
3368                          * Read from the L2ARC if the following are true:
3369                          * 1. The L2ARC vdev was previously cached.
3370                          * 2. This buffer still has L2ARC metadata.
3371                          * 3. This buffer isn't currently writing to the L2ARC.
3372                          * 4. The L2ARC entry wasn't evicted, which may
3373                          *    also have invalidated the vdev.
3374                          * 5. This isn't prefetch and l2arc_noprefetch is set.
                              *
                              * NOTE(review): as coded, condition 5 rejects the
                              * L2ARC read only when the buffer is a prefetch
                              * AND l2arc_noprefetch is set; the wording of
                              * item 5 reads differently.
3375                          */
3376                         if (hdr->b_l2hdr != NULL &&
3377                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3378                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3379                                 l2arc_read_callback_t *cb;
3380
3381                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3382                                 ARCSTAT_BUMP(arcstat_l2_hits);
3383                                 atomic_inc_32(&hdr->b_l2hdr->b_hits);
3384
3385                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3386                                     KM_PUSHPAGE);
3387                                 cb->l2rcb_buf = buf;
3388                                 cb->l2rcb_spa = spa;
3389                                 cb->l2rcb_bp = *bp;
3390                                 cb->l2rcb_zb = *zb;
3391                                 cb->l2rcb_flags = zio_flags;
3392                                 cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3393
3394                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3395                                     addr + size < vd->vdev_psize -
3396                                     VDEV_LABEL_END_SIZE);
3397
3398                                 /*
3399                                  * l2arc read.  The SCL_L2ARC lock will be
3400                                  * released by l2arc_read_done().
3401                                  * Issue a null zio if the underlying buffer
3402                                  * was squashed to zero size by compression.
3403                                  */
3404                                 if (hdr->b_l2hdr->b_compress ==
3405                                     ZIO_COMPRESS_EMPTY) {
3406                                         rzio = zio_null(pio, spa, vd,
3407                                             l2arc_read_done, cb,
3408                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3409                                             ZIO_FLAG_CANFAIL |
3410                                             ZIO_FLAG_DONT_PROPAGATE |
3411                                             ZIO_FLAG_DONT_RETRY);
3412                                 } else {
3413                                         rzio = zio_read_phys(pio, vd, addr,
3414                                             hdr->b_l2hdr->b_asize,
3415                                             buf->b_data, ZIO_CHECKSUM_OFF,
3416                                             l2arc_read_done, cb, priority,
3417                                             zio_flags | ZIO_FLAG_DONT_CACHE |
3418                                             ZIO_FLAG_CANFAIL |
3419                                             ZIO_FLAG_DONT_PROPAGATE |
3420                                             ZIO_FLAG_DONT_RETRY, B_FALSE);
3421                                 }
3422                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3423                                     zio_t *, rzio);
3424                                 ARCSTAT_INCR(arcstat_l2_read_bytes,
3425                                     hdr->b_l2hdr->b_asize);
3426
3427                                 if (*arc_flags & ARC_NOWAIT) {
3428                                         zio_nowait(rzio);
3429                                         goto out;
3430                                 }
3431
3432                                 ASSERT(*arc_flags & ARC_WAIT);
3433                                 if (zio_wait(rzio) == 0)
3434                                         goto out;
3435
3436                                 /* l2arc read error; goto zio_read() */
3437                         } else {
3438                                 DTRACE_PROBE1(l2arc__miss,
3439                                     arc_buf_hdr_t *, hdr);
3440                                 ARCSTAT_BUMP(arcstat_l2_misses);
3441                                 if (HDR_L2_WRITING(hdr))
3442                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3443                                 spa_config_exit(spa, SCL_L2ARC, vd);
3444                         }
3445                 } else {
3446                         if (vd != NULL)
3447                                 spa_config_exit(spa, SCL_L2ARC, vd);
3448                         if (l2arc_ndev != 0) {
3449                                 DTRACE_PROBE1(l2arc__miss,
3450                                     arc_buf_hdr_t *, hdr);
3451                                 ARCSTAT_BUMP(arcstat_l2_misses);
3452                         }
3453                 }
3454
                     /*
                      * Read from the pool; arc_read_done() runs on completion
                      * with buf as the zio's private data.
                      */
3455                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3456                     arc_read_done, buf, priority, zio_flags, zb);
3457
                     /* This is the only path on which rc can become non-zero. */
3458                 if (*arc_flags & ARC_WAIT) {
3459                         rc = zio_wait(rzio);
3460                         goto out;
3461                 }
3462
3463                 ASSERT(*arc_flags & ARC_NOWAIT);
3464                 zio_nowait(rzio);
3465         }
3466
3467 out:
3468         spa_read_history_add(spa, zb, *arc_flags);
3469         return (rc);
3470 }
3471
3472 arc_prune_t *
3473 arc_add_prune_callback(arc_prune_func_t *func, void *private)
3474 {
3475         arc_prune_t *p;
3476
3477         p = kmem_alloc(sizeof (*p), KM_SLEEP);
3478         p->p_pfunc = func;
3479         p->p_private = private;
3480         list_link_init(&p->p_node);
3481         refcount_create(&p->p_refcnt);
3482
3483         mutex_enter(&arc_prune_mtx);
3484         refcount_add(&p->p_refcnt, &arc_prune_list);
3485         list_insert_head(&arc_prune_list, p);
3486         mutex_exit(&arc_prune_mtx);
3487
3488         return (p);
3489 }
3490
3491 void
3492 arc_remove_prune_callback(arc_prune_t *p)
3493 {
3494         mutex_enter(&arc_prune_mtx);
3495         list_remove(&arc_prune_list, p);
3496         if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
3497                 refcount_destroy(&p->p_refcnt);
3498                 kmem_free(p, sizeof (*p));
3499         }
3500         mutex_exit(&arc_prune_mtx);
3501 }
3502
3503 void
3504 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3505 {
3506         ASSERT(buf->b_hdr != NULL);
3507         ASSERT(buf->b_hdr->b_state != arc_anon);
3508         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3509         ASSERT(buf->b_efunc == NULL);
3510         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3511
3512         buf->b_efunc = func;
3513         buf->b_private = private;
3514 }
3515
3516 /*
3517  * Notify the arc that a block was freed, and thus will never be used again.
3518  */
3519 void
3520 arc_freed(spa_t *spa, const blkptr_t *bp)
3521 {
3522         arc_buf_hdr_t *hdr;
3523         kmutex_t *hash_lock;
3524         uint64_t guid = spa_load_guid(spa);
3525
3526         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3527             &hash_lock);
3528         if (hdr == NULL)
3529                 return;
3530         if (HDR_BUF_AVAILABLE(hdr)) {
3531                 arc_buf_t *buf = hdr->b_buf;
3532                 add_reference(hdr, hash_lock, FTAG);
3533                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3534                 mutex_exit(hash_lock);
3535
3536                 arc_release(buf, FTAG);
3537                 (void) arc_buf_remove_ref(buf, FTAG);
3538         } else {
3539                 mutex_exit(hash_lock);
3540         }
3541
3542 }
3543
3544 /*
3545  * This is used by the DMU to let the ARC know that a buffer is
3546  * being evicted, so the ARC should clean up.  If this arc buf
3547  * is not yet in the evicted state, it will be put there.
3548  */
3549 int
3550 arc_buf_evict(arc_buf_t *buf)
3551 {
3552         arc_buf_hdr_t *hdr;
3553         kmutex_t *hash_lock;
3554         arc_buf_t **bufp;
3555
3556         mutex_enter(&buf->b_evict_lock);
3557         hdr = buf->b_hdr;
3558         if (hdr == NULL) {
3559                 /*
3560                  * We are in arc_do_user_evicts().
3561                  */
3562                 ASSERT(buf->b_data == NULL);
3563                 mutex_exit(&buf->b_evict_lock);
3564                 return (0);
3565         } else if (buf->b_data == NULL) {
3566                 arc_buf_t copy = *buf; /* structure assignment */
3567                 /*
3568                  * We are on the eviction list; process this buffer now
3569                  * but let arc_do_user_evicts() do the reaping.
3570                  */
3571                 buf->b_efunc = NULL;
3572                 mutex_exit(&buf->b_evict_lock);
3573                 VERIFY(copy.b_efunc(&copy) == 0);
3574                 return (1);
3575         }
3576         hash_lock = HDR_LOCK(hdr);
3577         mutex_enter(hash_lock);
3578         hdr = buf->b_hdr;
3579         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3580
3581         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3582         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3583
3584         /*
3585          * Pull this buffer off of the hdr
3586          */
3587         bufp = &hdr->b_buf;
3588         while (*bufp != buf)
3589                 bufp = &(*bufp)->b_next;
3590         *bufp = buf->b_next;
3591
3592         ASSERT(buf->b_data != NULL);
3593         arc_buf_destroy(buf, FALSE, FALSE);
3594
3595         if (hdr->b_datacnt == 0) {
3596                 arc_state_t *old_state = hdr->b_state;
3597                 arc_state_t *evicted_state;
3598
3599                 ASSERT(hdr->b_buf == NULL);
3600                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3601
3602                 evicted_state =
3603                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3604
3605                 mutex_enter(&old_state->arcs_mtx);
3606                 mutex_enter(&evicted_state->arcs_mtx);
3607
3608                 arc_change_state(evicted_state, hdr, hash_lock);
3609                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3610                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3611                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3612
3613                 mutex_exit(&evicted_state->arcs_mtx);
3614                 mutex_exit(&old_state->arcs_mtx);
3615         }
3616         mutex_exit(hash_lock);
3617         mutex_exit(&buf->b_evict_lock);
3618
3619         VERIFY(buf->b_efunc(buf) == 0);
3620         buf->b_efunc = NULL;
3621         buf->b_private = NULL;
3622         buf->b_hdr = NULL;
3623         buf->b_next = NULL;
3624         kmem_cache_free(buf_cache, buf);
3625         return (1);
3626 }
3627
3628 /*
3629  * Release this buffer from the cache, making it an anonymous buffer.  This
3630  * must be done after a read and prior to modifying the buffer contents.
3631  * If the buffer has more than one reference, we must make
3632  * a new hdr for the buffer.
3633  */
3634 void
3635 arc_release(arc_buf_t *buf, void *tag)
3636 {
3637         arc_buf_hdr_t *hdr;
3638         kmutex_t *hash_lock = NULL;
3639         l2arc_buf_hdr_t *l2hdr;
3640         uint64_t buf_size = 0;
3641
3642         /*
3643          * It would be nice to assert that if it's DMU metadata (level >
3644          * 0 || it's the dnode file), then it must be syncing context.
3645          * But we don't know that information at this level.
3646          */
3647
3648         mutex_enter(&buf->b_evict_lock);
3649         hdr = buf->b_hdr;
3650
3651         /* this buffer is not on any list */
3652         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3653
3654         if (hdr->b_state == arc_anon) {
3655                 /* this buffer is already released */
3656                 ASSERT(buf->b_efunc == NULL);
3657         } else {
3658                 hash_lock = HDR_LOCK(hdr);
3659                 mutex_enter(hash_lock);
3660                 hdr = buf->b_hdr;
3661                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3662         }
3663
3664         l2hdr = hdr->b_l2hdr;
3665         if (l2hdr) {
3666                 mutex_enter(&l2arc_buflist_mtx);
3667                 hdr->b_l2hdr = NULL;
3668                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3669         }
3670         buf_size = hdr->b_size;
3671
3672         /*
3673          * Do we have more than one buf?
3674          */
3675         if (hdr->b_datacnt > 1) {
3676                 arc_buf_hdr_t *nhdr;
3677                 arc_buf_t **bufp;
3678                 uint64_t blksz = hdr->b_size;
3679                 uint64_t spa = hdr->b_spa;
3680                 arc_buf_contents_t type = hdr->b_type;
3681                 uint32_t flags = hdr->b_flags;
3682
3683                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3684                 /*
3685                  * Pull the data off of this hdr and attach it to
3686                  * a new anonymous hdr.
3687                  */
3688                 (void) remove_reference(hdr, hash_lock, tag);
3689                 bufp = &hdr->b_buf;
3690                 while (*bufp != buf)
3691                         bufp = &(*bufp)->b_next;
3692                 *bufp = buf->b_next;
3693                 buf->b_next = NULL;
3694
3695                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3696                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3697                 if (refcount_is_zero(&hdr->b_refcnt)) {
3698                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3699                         ASSERT3U(*size, >=, hdr->b_size);
3700                         atomic_add_64(size, -hdr->b_size);
3701                 }
3702
3703                 /*
3704                  * We're releasing a duplicate user data buffer, update
3705                  * our statistics accordingly.
3706                  */
3707                 if (hdr->b_type == ARC_BUFC_DATA) {
3708                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3709                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3710                             -hdr->b_size);
3711                 }
3712                 hdr->b_datacnt -= 1;
3713                 arc_cksum_verify(buf);
3714                 arc_buf_unwatch(buf);
3715
3716                 mutex_exit(hash_lock);
3717
3718                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3719                 nhdr->b_size = blksz;
3720                 nhdr->b_spa = spa;
3721                 nhdr->b_type = type;
3722                 nhdr->b_buf = buf;
3723                 nhdr->b_state = arc_anon;
3724                 nhdr->b_arc_access = 0;
3725                 nhdr->b_mru_hits = 0;
3726                 nhdr->b_mru_ghost_hits = 0;
3727                 nhdr->b_mfu_hits = 0;
3728                 nhdr->b_mfu_ghost_hits = 0;
3729                 nhdr->b_l2_hits = 0;
3730                 nhdr->b_flags = flags & ARC_L2_WRITING;
3731                 nhdr->b_l2hdr = NULL;
3732                 nhdr->b_datacnt = 1;
3733                 nhdr->b_freeze_cksum = NULL;
3734                 (void) refcount_add(&nhdr->b_refcnt, tag);
3735                 buf->b_hdr = nhdr;
3736                 mutex_exit(&buf->b_evict_lock);
3737                 atomic_add_64(&arc_anon->arcs_size, blksz);
3738         } else {
3739                 mutex_exit(&buf->b_evict_lock);
3740                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3741                 ASSERT(!list_link_active(&hdr->b_arc_node));
3742                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3743                 if (hdr->b_state != arc_anon)
3744                         arc_change_state(arc_anon, hdr, hash_lock);
3745                 hdr->b_arc_access = 0;
3746                 hdr->b_mru_hits = 0;
3747                 hdr->b_mru_ghost_hits = 0;
3748                 hdr->b_mfu_hits = 0;
3749                 hdr->b_mfu_ghost_hits = 0;
3750                 hdr->b_l2_hits = 0;
3751                 if (hash_lock)
3752                         mutex_exit(hash_lock);
3753
3754                 buf_discard_identity(hdr);
3755                 arc_buf_thaw(buf);
3756         }
3757         buf->b_efunc = NULL;
3758         buf->b_private = NULL;
3759
3760         if (l2hdr) {
3761                 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3762                 kmem_cache_free(l2arc_hdr_cache, l2hdr);
3763                 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
3764                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3765                 mutex_exit(&l2arc_buflist_mtx);
3766         }
3767 }
3768
3769 int
3770 arc_released(arc_buf_t *buf)
3771 {
3772         int released;
3773
3774         mutex_enter(&buf->b_evict_lock);
3775         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3776         mutex_exit(&buf->b_evict_lock);
3777         return (released);
3778 }
3779
3780 int
3781 arc_has_callback(arc_buf_t *buf)
3782 {
3783         int callback;
3784
3785         mutex_enter(&buf->b_evict_lock);
3786         callback = (buf->b_efunc != NULL);
3787         mutex_exit(&buf->b_evict_lock);
3788         return (callback);
3789 }
3790
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the current reference count of the header
 * that owns this buffer (nonzero means the header is referenced).  The
 * count is read while holding the buffer's b_evict_lock.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int holds;

	mutex_enter(&buf->b_evict_lock);
	holds = refcount_count(&buf->b_hdr->b_refcnt);
	mutex_exit(&buf->b_evict_lock);

	return (holds);
}
#endif
3803
3804 static void
3805 arc_write_ready(zio_t *zio)
3806 {
3807         arc_write_callback_t *callback = zio->io_private;
3808         arc_buf_t *buf = callback->awcb_buf;
3809         arc_buf_hdr_t *hdr = buf->b_hdr;
3810
3811         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3812         callback->awcb_ready(zio, buf, callback->awcb_private);
3813
3814         /*
3815          * If the IO is already in progress, then this is a re-write
3816          * attempt, so we need to thaw and re-compute the cksum.
3817          * It is the responsibility of the callback to handle the
3818          * accounting for any re-write attempt.
3819          */
3820         if (HDR_IO_IN_PROGRESS(hdr)) {
3821                 mutex_enter(&hdr->b_freeze_lock);
3822                 if (hdr->b_freeze_cksum != NULL) {
3823                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3824                         hdr->b_freeze_cksum = NULL;
3825                 }
3826                 mutex_exit(&hdr->b_freeze_lock);
3827         }
3828         arc_cksum_compute(buf, B_FALSE);
3829         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3830 }
3831
3832 /*
3833  * The SPA calls this callback for each physical write that happens on behalf
3834  * of a logical write.  See the comment in dbuf_write_physdone() for details.
3835  */
3836 static void
3837 arc_write_physdone(zio_t *zio)
3838 {
3839         arc_write_callback_t *cb = zio->io_private;
3840         if (cb->awcb_physdone != NULL)
3841                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3842 }
3843
3844 static void
3845 arc_write_done(zio_t *zio)
3846 {
3847         arc_write_callback_t *callback = zio->io_private;
3848         arc_buf_t *buf = callback->awcb_buf;
3849         arc_buf_hdr_t *hdr = buf->b_hdr;
3850
3851         ASSERT(hdr->b_acb == NULL);
3852
3853         if (zio->io_error == 0) {
3854                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3855                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3856                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3857         } else {
3858                 ASSERT(BUF_EMPTY(hdr));
3859         }
3860
3861         /*
3862          * If the block to be written was all-zero, we may have
3863          * compressed it away.  In this case no write was performed
3864          * so there will be no dva/birth/checksum.  The buffer must
3865          * therefore remain anonymous (and uncached).
3866          */
3867         if (!BUF_EMPTY(hdr)) {
3868                 arc_buf_hdr_t *exists;
3869                 kmutex_t *hash_lock;
3870
3871                 ASSERT(zio->io_error == 0);
3872
3873                 arc_cksum_verify(buf);
3874
3875                 exists = buf_hash_insert(hdr, &hash_lock);
3876                 if (exists) {
3877                         /*
3878                          * This can only happen if we overwrite for
3879                          * sync-to-convergence, because we remove
3880                          * buffers from the hash table when we arc_free().
3881                          */
3882                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3883                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3884                                         panic("bad overwrite, hdr=%p exists=%p",
3885                                             (void *)hdr, (void *)exists);
3886                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3887                                 arc_change_state(arc_anon, exists, hash_lock);
3888                                 mutex_exit(hash_lock);
3889                                 arc_hdr_destroy(exists);
3890                                 exists = buf_hash_insert(hdr, &hash_lock);
3891                                 ASSERT3P(exists, ==, NULL);
3892                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3893                                 /* nopwrite */
3894                                 ASSERT(zio->io_prop.zp_nopwrite);
3895                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3896                                         panic("bad nopwrite, hdr=%p exists=%p",
3897                                             (void *)hdr, (void *)exists);
3898                         } else {
3899                                 /* Dedup */
3900                                 ASSERT(hdr->b_datacnt == 1);
3901                                 ASSERT(hdr->b_state == arc_anon);
3902                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3903                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3904                         }
3905                 }
3906                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3907                 /* if it's not anon, we are doing a scrub */
3908                 if (!exists && hdr->b_state == arc_anon)
3909                         arc_access(hdr, hash_lock);
3910                 mutex_exit(hash_lock);
3911         } else {
3912                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3913         }
3914
3915         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3916         callback->awcb_done(zio, buf, callback->awcb_private);
3917
3918         kmem_free(callback, sizeof (arc_write_callback_t));
3919 }
3920
3921 zio_t *
3922 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3923     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3924     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3925     arc_done_func_t *done, void *private, zio_priority_t priority,
3926     int zio_flags, const zbookmark_t *zb)
3927 {
3928         arc_buf_hdr_t *hdr = buf->b_hdr;
3929         arc_write_callback_t *callback;
3930         zio_t *zio;
3931
3932         ASSERT(ready != NULL);
3933         ASSERT(done != NULL);
3934         ASSERT(!HDR_IO_ERROR(hdr));
3935         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3936         ASSERT(hdr->b_acb == NULL);
3937         if (l2arc)
3938                 hdr->b_flags |= ARC_L2CACHE;
3939         if (l2arc_compress)
3940                 hdr->b_flags |= ARC_L2COMPRESS;
3941         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE);
3942         callback->awcb_ready = ready;
3943         callback->awcb_physdone = physdone;
3944         callback->awcb_done = done;
3945         callback->awcb_private = private;
3946         callback->awcb_buf = buf;
3947
3948         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3949             arc_write_ready, arc_write_physdone, arc_write_done, callback,
3950             priority, zio_flags, zb);
3951
3952         return (zio);
3953 }
3954
3955 static int
3956 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3957 {
3958 #ifdef _KERNEL
3959         if (zfs_arc_memory_throttle_disable)
3960                 return (0);
3961
3962         if (freemem <= physmem * arc_lotsfree_percent / 100) {
3963                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3964                 DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
3965                 return (SET_ERROR(EAGAIN));
3966         }
3967 #endif
3968         return (0);
3969 }
3970
3971 void
3972 arc_tempreserve_clear(uint64_t reserve)
3973 {
3974         atomic_add_64(&arc_tempreserve, -reserve);
3975         ASSERT((int64_t)arc_tempreserve >= 0);
3976 }
3977
3978 int
3979 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3980 {
3981         int error;
3982         uint64_t anon_size;
3983
3984         if (reserve > arc_c/4 && !arc_no_grow)
3985                 arc_c = MIN(arc_c_max, reserve * 4);
3986         if (reserve > arc_c) {
3987                 DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
3988                 return (SET_ERROR(ENOMEM));
3989         }
3990
3991         /*
3992          * Don't count loaned bufs as in flight dirty data to prevent long
3993          * network delays from blocking transactions that are ready to be
3994          * assigned to a txg.
3995          */
3996         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3997
3998         /*
3999          * Writes will, almost always, require additional memory allocations
4000          * in order to compress/encrypt/etc the data.  We therefore need to
4001          * make sure that there is sufficient available memory for this.
4002          */
4003         error = arc_memory_throttle(reserve, txg);
4004         if (error != 0)
4005                 return (error);
4006
4007         /*
4008          * Throttle writes when the amount of dirty data in the cache
4009          * gets too large.  We try to keep the cache less than half full
4010          * of dirty blocks so that our sync times don't grow too large.
4011          * Note: if two requests come in concurrently, we might let them
4012          * both succeed, when one of them should fail.  Not a huge deal.
4013          */
4014
4015         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4016             anon_size > arc_c / 4) {
4017                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4018                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4019                     arc_tempreserve>>10,
4020                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4021                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4022                     reserve>>10, arc_c>>10);
4023                 DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
4024                 return (SET_ERROR(ERESTART));
4025         }
4026         atomic_add_64(&arc_tempreserve, reserve);
4027         return (0);
4028 }
4029
4030 static void
4031 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4032     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4033 {
4034         size->value.ui64 = state->arcs_size;
4035         evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4036         evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4037 }
4038
4039 static int
4040 arc_kstat_update(kstat_t *ksp, int rw)
4041 {
4042         arc_stats_t *as = ksp->ks_data;
4043
4044         if (rw == KSTAT_WRITE) {
4045                 return (SET_ERROR(EACCES));
4046         } else {
4047                 arc_kstat_update_state(arc_anon,
4048                     &as->arcstat_anon_size,
4049                     &as->arcstat_anon_evict_data,
4050                     &as->arcstat_anon_evict_metadata);
4051                 arc_kstat_update_state(arc_mru,
4052                     &as->arcstat_mru_size,
4053                     &as->arcstat_mru_evict_data,
4054                     &as->arcstat_mru_evict_metadata);
4055                 arc_kstat_update_state(arc_mru_ghost,
4056                     &as->arcstat_mru_ghost_size,
4057                     &as->arcstat_mru_ghost_evict_data,
4058                     &as->arcstat_mru_ghost_evict_metadata);
4059                 arc_kstat_update_state(arc_mfu,
4060                     &as->arcstat_mfu_size,
4061                     &as->arcstat_mfu_evict_data,
4062                     &as->arcstat_mfu_evict_metadata);
4063                 arc_kstat_update_state(arc_mfu_ghost,
4064                     &as->arcstat_mfu_ghost_size,
4065                     &as->arcstat_mfu_ghost_evict_data,
4066                     &as->arcstat_mfu_ghost_evict_metadata);
4067         }
4068
4069         return (0);
4070 }
4071
4072 void
4073 arc_init(void)
4074 {
4075         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4076         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4077
4078         /* Convert seconds to clock ticks */
4079         zfs_arc_min_prefetch_lifespan = 1 * hz;
4080
4081         /* Start out with 1/8 of all memory */
4082         arc_c = physmem * PAGESIZE / 8;
4083
4084 #ifdef _KERNEL
4085         /*
4086          * On architectures where the physical memory can be larger
4087          * than the addressable space (intel in 32-bit mode), we may
4088          * need to limit the cache to 1/8 of VM size.
4089          */
4090         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4091         /*
4092          * Register a shrinker to support synchronous (direct) memory
4093          * reclaim from the arc.  This is done to prevent kswapd from
4094          * swapping out pages when it is preferable to shrink the arc.
4095          */
4096         spl_register_shrinker(&arc_shrinker);
4097 #endif
4098
4099         /* set min cache to zero */
4100         arc_c_min = 4<<20;
4101         /* set max to 1/2 of all memory */
4102         arc_c_max = arc_c * 4;
4103
4104         /*
4105          * Allow the tunables to override our calculations if they are
4106          * reasonable (ie. over 64MB)
4107          */
4108         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
4109                 arc_c_max = zfs_arc_max;
4110         if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
4111                 arc_c_min = zfs_arc_min;
4112
4113         arc_c = arc_c_max;
4114         arc_p = (arc_c >> 1);
4115
4116         /* limit meta-data to 3/4 of the arc capacity */
4117         arc_meta_limit = (3 * arc_c_max) / 4;
4118         arc_meta_max = 0;
4119
4120         /* Allow the tunable to override if it is reasonable */
4121         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4122                 arc_meta_limit = zfs_arc_meta_limit;
4123
4124         /* if kmem_flags are set, lets try to use less memory */
4125         if (kmem_debugging())
4126                 arc_c = arc_c / 2;
4127         if (arc_c < arc_c_min)
4128                 arc_c = arc_c_min;
4129
4130         arc_anon = &ARC_anon;
4131         arc_mru = &ARC_mru;
4132         arc_mru_ghost = &ARC_mru_ghost;
4133         arc_mfu = &ARC_mfu;
4134         arc_mfu_ghost = &ARC_mfu_ghost;
4135         arc_l2c_only = &ARC_l2c_only;
4136         arc_size = 0;
4137
4138         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4139         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4140         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4141         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4142         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4143         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4144
4145         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4146             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4147         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4148             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4149         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4150             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4151         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4152             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4153         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4154             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4155         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4156             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4157         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4158             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4159         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4160             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4161         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4162             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4163         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4164             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4165
4166         arc_anon->arcs_state = ARC_STATE_ANON;
4167         arc_mru->arcs_state = ARC_STATE_MRU;
4168         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
4169         arc_mfu->arcs_state = ARC_STATE_MFU;
4170         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
4171         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
4172
4173         buf_init();
4174
4175         arc_thread_exit = 0;
4176         list_create(&arc_prune_list, sizeof (arc_prune_t),
4177             offsetof(arc_prune_t, p_node));
4178         arc_eviction_list = NULL;
4179         mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
4180         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4181         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4182
4183         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4184             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4185
4186         if (arc_ksp != NULL) {
4187                 arc_ksp->ks_data = &arc_stats;
4188                 arc_ksp->ks_update = arc_kstat_update;
4189                 kstat_install(arc_ksp);
4190         }
4191
4192         (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
4193             TS_RUN, minclsyspri);
4194
4195         arc_dead = FALSE;
4196         arc_warm = B_FALSE;
4197
4198         /*
4199          * Calculate maximum amount of dirty data per pool.
4200          *
4201          * If it has been set by a module parameter, take that.
4202          * Otherwise, use a percentage of physical memory defined by
4203          * zfs_dirty_data_max_percent (default 10%) with a cap at
4204          * zfs_dirty_data_max_max (default 25% of physical memory).
4205          */
4206         if (zfs_dirty_data_max_max == 0)
4207                 zfs_dirty_data_max_max = physmem * PAGESIZE *
4208                     zfs_dirty_data_max_max_percent / 100;
4209
4210         if (zfs_dirty_data_max == 0) {
4211                 zfs_dirty_data_max = physmem * PAGESIZE *
4212                     zfs_dirty_data_max_percent / 100;
4213                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4214                     zfs_dirty_data_max_max);
4215         }
4216 }
4217
4218 void
4219 arc_fini(void)
4220 {
4221         arc_prune_t *p;
4222
4223         mutex_enter(&arc_reclaim_thr_lock);
4224 #ifdef _KERNEL
4225         spl_unregister_shrinker(&arc_shrinker);
4226 #endif /* _KERNEL */
4227
4228         arc_thread_exit = 1;
4229         while (arc_thread_exit != 0)
4230                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4231         mutex_exit(&arc_reclaim_thr_lock);
4232
4233         arc_flush(NULL);
4234
4235         arc_dead = TRUE;
4236
4237         if (arc_ksp != NULL) {
4238                 kstat_delete(arc_ksp);
4239                 arc_ksp = NULL;
4240         }
4241
4242         mutex_enter(&arc_prune_mtx);
4243         while ((p = list_head(&arc_prune_list)) != NULL) {
4244                 list_remove(&arc_prune_list, p);
4245                 refcount_remove(&p->p_refcnt, &arc_prune_list);
4246                 refcount_destroy(&p->p_refcnt);
4247                 kmem_free(p, sizeof (*p));
4248         }
4249         mutex_exit(&arc_prune_mtx);
4250
4251         list_destroy(&arc_prune_list);
4252         mutex_destroy(&arc_prune_mtx);
4253         mutex_destroy(&arc_eviction_mtx);
4254         mutex_destroy(&arc_reclaim_thr_lock);
4255         cv_destroy(&arc_reclaim_thr_cv);
4256
4257         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4258         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4259         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4260         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4261         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4262         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4263         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4264         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4265
4266         mutex_destroy(&arc_anon->arcs_mtx);
4267         mutex_destroy(&arc_mru->arcs_mtx);
4268         mutex_destroy(&arc_mru_ghost->arcs_mtx);
4269         mutex_destroy(&arc_mfu->arcs_mtx);
4270         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4271         mutex_destroy(&arc_l2c_only->arcs_mtx);
4272
4273         buf_fini();
4274
4275         ASSERT(arc_loaned_bytes == 0);
4276 }
4277
4278 /*
4279  * Level 2 ARC
4280  *
4281  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4282  * It uses dedicated storage devices to hold cached data, which are populated
4283  * using large infrequent writes.  The main role of this cache is to boost
4284  * the performance of random read workloads.  The intended L2ARC devices
4285  * include short-stroked disks, solid state disks, and other media with
4286  * substantially faster read latency than disk.
4287  *
4288  *                 +-----------------------+
4289  *                 |         ARC           |
4290  *                 +-----------------------+
4291  *                    |         ^     ^
4292  *                    |         |     |
4293  *      l2arc_feed_thread()    arc_read()
4294  *                    |         |     |
4295  *                    |  l2arc read   |
4296  *                    V         |     |
4297  *               +---------------+    |
4298  *               |     L2ARC     |    |
4299  *               +---------------+    |
4300  *                   |    ^           |
4301  *          l2arc_write() |           |
4302  *                   |    |           |
4303  *                   V    |           |
4304  *                 +-------+      +-------+
4305  *                 | vdev  |      | vdev  |
4306  *                 | cache |      | cache |
4307  *                 +-------+      +-------+
4308  *                 +=========+     .-----.
4309  *                 :  L2ARC  :    |-_____-|
4310  *                 : devices :    | Disks |
4311  *                 +=========+    `-_____-'
4312  *
4313  * Read requests are satisfied from the following sources, in order:
4314  *
4315  *      1) ARC
4316  *      2) vdev cache of L2ARC devices
4317  *      3) L2ARC devices
4318  *      4) vdev cache of disks
4319  *      5) disks
4320  *
4321  * Some L2ARC device types exhibit extremely slow write performance.
4322  * To accommodate for this there are some significant differences between
4323  * the L2ARC and traditional cache design:
4324  *
4325  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4326  * the ARC behave as usual, freeing buffers and placing headers on ghost
4327  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4328  * this would add inflated write latencies for all ARC memory pressure.
4329  *
4330  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4331  * It does this by periodically scanning buffers from the eviction-end of
4332  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4333  * not already there. It scans until a headroom of buffers is satisfied,
4334  * which itself is a buffer for ARC eviction. If a compressible buffer is
4335  * found during scanning and selected for writing to an L2ARC device, we
4336  * temporarily boost scanning headroom during the next scan cycle to make
4337  * sure we adapt to compression effects (which might significantly reduce
4338  * the data volume we write to L2ARC). The thread that does this is
4339  * l2arc_feed_thread(), illustrated below; example sizes are included to
4340  * provide a better sense of ratio than this diagram:
4341  *
4342  *             head -->                        tail
4343  *              +---------------------+----------+
4344  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4345  *              +---------------------+----------+   |   o L2ARC eligible
4346  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4347  *              +---------------------+----------+   |
4348  *                   15.9 Gbytes      ^ 32 Mbytes    |
4349  *                                 headroom          |
4350  *                                            l2arc_feed_thread()
4351  *                                                   |
4352  *                       l2arc write hand <--[oooo]--'
4353  *                               |           8 Mbyte
4354  *                               |          write max
4355  *                               V
4356  *                +==============================+
4357  *      L2ARC dev |####|#|###|###|    |####| ... |
4358  *                +==============================+
4359  *                           32 Gbytes
4360  *
4361  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4362  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4363  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4364  * safe to say that this is an uncommon case, since buffers at the end of
4365  * the ARC lists have moved there due to inactivity.
4366  *
4367  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4368  * then the L2ARC simply misses copying some buffers.  This serves as a
4369  * pressure valve to prevent heavy read workloads from both stalling the ARC
4370  * with waits and clogging the L2ARC with writes.  This also helps prevent
4371  * the potential for the L2ARC to churn if it attempts to cache content too
4372  * quickly, such as during backups of the entire pool.
4373  *
4374  * 5. After system boot and before the ARC has filled main memory, there are
4375  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4376  * lists can remain mostly static.  Instead of searching from tail of these
4377  * lists as pictured, the l2arc_feed_thread() will search from the list heads
4378  * for eligible buffers, greatly increasing its chance of finding them.
4379  *
4380  * The L2ARC device write speed is also boosted during this time so that
4381  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4382  * there are no L2ARC reads, and no fear of degrading read performance
4383  * through increased writes.
4384  *
4385  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4386  * the vdev queue can aggregate them into larger and fewer writes.  Each
4387  * device is written to in a rotor fashion, sweeping writes through
4388  * available space then repeating.
4389  *
4390  * 7. The L2ARC does not store dirty content.  It never needs to flush
4391  * write buffers back to disk based storage.
4392  *
4393  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4394  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4395  *
4396  * The performance of the L2ARC can be tweaked by a number of tunables, which
4397  * may be necessary for different workloads:
4398  *
4399  *      l2arc_write_max         max write bytes per interval
4400  *      l2arc_write_boost       extra write bytes during device warmup
4401  *      l2arc_noprefetch        skip caching prefetched buffers
4402  *      l2arc_nocompress        skip compressing buffers
4403  *      l2arc_headroom          number of max device writes to precache
4404  *      l2arc_headroom_boost    when we find compressed buffers during ARC
4405  *                              scanning, we multiply headroom by this
4406  *                              percentage factor for the next scan cycle,
4407  *                              since more compressed buffers are likely to
4408  *                              be present
4409  *      l2arc_feed_secs         seconds between L2ARC writing
4410  *
4411  * Tunables may be removed or added as future performance improvements are
4412  * integrated, and also may become zpool properties.
4413  *
4414  * There are three key functions that control how the L2ARC warms up:
4415  *
4416  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4417  *      l2arc_write_size()      calculate how much to write
4418  *      l2arc_write_interval()  calculate sleep delay between writes
4419  *
4420  * These three functions determine what to write, how much, and how quickly
4421  * to send writes.
4422  */
4423
4424 static boolean_t
4425 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4426 {
4427         /*
4428          * A buffer is *not* eligible for the L2ARC if it:
4429          * 1. belongs to a different spa.
4430          * 2. is already cached on the L2ARC.
4431          * 3. has an I/O in progress (it may be an incomplete read).
4432          * 4. is flagged not eligible (zfs property).
4433          */
4434         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
4435             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
4436                 return (B_FALSE);
4437
4438         return (B_TRUE);
4439 }
4440
4441 static uint64_t
4442 l2arc_write_size(void)
4443 {
4444         uint64_t size;
4445
4446         /*
4447          * Make sure our globals have meaningful values in case the user
4448          * altered them.
4449          */
4450         size = l2arc_write_max;
4451         if (size == 0) {
4452                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4453                     "be greater than zero, resetting it to the default (%d)",
4454                     L2ARC_WRITE_SIZE);
4455                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
4456         }
4457
4458         if (arc_warm == B_FALSE)
4459                 size += l2arc_write_boost;
4460
4461         return (size);
4462
4463 }
4464
4465 static clock_t
4466 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4467 {
4468         clock_t interval, next, now;
4469
4470         /*
4471          * If the ARC lists are busy, increase our write rate; if the
4472          * lists are stale, idle back.  This is achieved by checking
4473          * how much we previously wrote - if it was more than half of
4474          * what we wanted, schedule the next write much sooner.
4475          */
4476         if (l2arc_feed_again && wrote > (wanted / 2))
4477                 interval = (hz * l2arc_feed_min_ms) / 1000;
4478         else
4479                 interval = hz * l2arc_feed_secs;
4480
4481         now = ddi_get_lbolt();
4482         next = MAX(now, MIN(now + interval, began + interval));
4483
4484         return (next);
4485 }
4486
4487 static void
4488 l2arc_hdr_stat_add(void)
4489 {
4490         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE);
4491         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4492 }
4493
4494 static void
4495 l2arc_hdr_stat_remove(void)
4496 {
4497         ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE);
4498         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4499 }
4500
4501 /*
4502  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4503  * If a device is returned, this also returns holding the spa config lock.
4504  */
4505 static l2arc_dev_t *
4506 l2arc_dev_get_next(void)
4507 {
4508         l2arc_dev_t *first, *next = NULL;
4509
4510         /*
4511          * Lock out the removal of spas (spa_namespace_lock), then removal
4512          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4513          * both locks will be dropped and a spa config lock held instead.
4514          */
4515         mutex_enter(&spa_namespace_lock);
4516         mutex_enter(&l2arc_dev_mtx);
4517
4518         /* if there are no vdevs, there is nothing to do */
4519         if (l2arc_ndev == 0)
4520                 goto out;
4521
4522         first = NULL;
4523         next = l2arc_dev_last;
4524         do {
4525                 /* loop around the list looking for a non-faulted vdev */
4526                 if (next == NULL) {
4527                         next = list_head(l2arc_dev_list);
4528                 } else {
4529                         next = list_next(l2arc_dev_list, next);
4530                         if (next == NULL)
4531                                 next = list_head(l2arc_dev_list);
4532                 }
4533
4534                 /* if we have come back to the start, bail out */
4535                 if (first == NULL)
4536                         first = next;
4537                 else if (next == first)
4538                         break;
4539
4540         } while (vdev_is_dead(next->l2ad_vdev));
4541
4542         /* if we were unable to find any usable vdevs, return NULL */
4543         if (vdev_is_dead(next->l2ad_vdev))
4544                 next = NULL;
4545
4546         l2arc_dev_last = next;
4547
4548 out:
4549         mutex_exit(&l2arc_dev_mtx);
4550
4551         /*
4552          * Grab the config lock to prevent the 'next' device from being
4553          * removed while we are writing to it.
4554          */
4555         if (next != NULL)
4556                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4557         mutex_exit(&spa_namespace_lock);
4558
4559         return (next);
4560 }
4561
4562 /*
4563  * Free buffers that were tagged for destruction.
4564  */
4565 static void
4566 l2arc_do_free_on_write(void)
4567 {
4568         list_t *buflist;
4569         l2arc_data_free_t *df, *df_prev;
4570
4571         mutex_enter(&l2arc_free_on_write_mtx);
4572         buflist = l2arc_free_on_write;
4573
4574         for (df = list_tail(buflist); df; df = df_prev) {
4575                 df_prev = list_prev(buflist, df);
4576                 ASSERT(df->l2df_data != NULL);
4577                 ASSERT(df->l2df_func != NULL);
4578                 df->l2df_func(df->l2df_data, df->l2df_size);
4579                 list_remove(buflist, df);
4580                 kmem_free(df, sizeof (l2arc_data_free_t));
4581         }
4582
4583         mutex_exit(&l2arc_free_on_write_mtx);
4584 }
4585
4586 /*
4587  * A write to a cache device has completed.  Update all headers to allow
4588  * reads from these buffers to begin.
4589  */
4590 static void
4591 l2arc_write_done(zio_t *zio)
4592 {
4593         l2arc_write_callback_t *cb;
4594         l2arc_dev_t *dev;
4595         list_t *buflist;
4596         arc_buf_hdr_t *head, *ab, *ab_prev;
4597         l2arc_buf_hdr_t *abl2;
4598         kmutex_t *hash_lock;
4599
4600         cb = zio->io_private;
4601         ASSERT(cb != NULL);
4602         dev = cb->l2wcb_dev;
4603         ASSERT(dev != NULL);
4604         head = cb->l2wcb_head;
4605         ASSERT(head != NULL);
4606         buflist = dev->l2ad_buflist;
4607         ASSERT(buflist != NULL);
4608         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4609             l2arc_write_callback_t *, cb);
4610
4611         if (zio->io_error != 0)
4612                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4613
4614         mutex_enter(&l2arc_buflist_mtx);
4615
4616         /*
4617          * All writes completed, or an error was hit.
4618          */
4619         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4620                 ab_prev = list_prev(buflist, ab);
4621                 abl2 = ab->b_l2hdr;
4622
4623                 /*
4624                  * Release the temporary compressed buffer as soon as possible.
4625                  */
4626                 if (abl2->b_compress != ZIO_COMPRESS_OFF)
4627                         l2arc_release_cdata_buf(ab);
4628
4629                 hash_lock = HDR_LOCK(ab);
4630                 if (!mutex_tryenter(hash_lock)) {
4631                         /*
4632                          * This buffer misses out.  It may be in a stage
4633                          * of eviction.  Its ARC_L2_WRITING flag will be
4634                          * left set, denying reads to this buffer.
4635                          */
4636                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4637                         continue;
4638                 }
4639
4640                 if (zio->io_error != 0) {
4641                         /*
4642                          * Error - drop L2ARC entry.
4643                          */
4644                         list_remove(buflist, ab);
4645                         ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4646                         ab->b_l2hdr = NULL;
4647                         kmem_cache_free(l2arc_hdr_cache, abl2);
4648                         arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4649                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4650                 }
4651
4652                 /*
4653                  * Allow ARC to begin reads to this L2ARC entry.
4654                  */
4655                 ab->b_flags &= ~ARC_L2_WRITING;
4656
4657                 mutex_exit(hash_lock);
4658         }
4659
4660         atomic_inc_64(&l2arc_writes_done);
4661         list_remove(buflist, head);
4662         kmem_cache_free(hdr_cache, head);
4663         mutex_exit(&l2arc_buflist_mtx);
4664
4665         l2arc_do_free_on_write();
4666
4667         kmem_free(cb, sizeof (l2arc_write_callback_t));
4668 }
4669
4670 /*
4671  * A read to a cache device completed.  Validate buffer contents before
4672  * handing over to the regular ARC routines.
4673  */
4674 static void
4675 l2arc_read_done(zio_t *zio)
4676 {
4677         l2arc_read_callback_t *cb;
4678         arc_buf_hdr_t *hdr;
4679         arc_buf_t *buf;
4680         kmutex_t *hash_lock;
4681         int equal;
4682
4683         ASSERT(zio->io_vd != NULL);
4684         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4685
4686         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4687
4688         cb = zio->io_private;
4689         ASSERT(cb != NULL);
4690         buf = cb->l2rcb_buf;
4691         ASSERT(buf != NULL);
4692
4693         hash_lock = HDR_LOCK(buf->b_hdr);
4694         mutex_enter(hash_lock);
4695         hdr = buf->b_hdr;
4696         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4697
4698         /*
4699          * If the buffer was compressed, decompress it first.
4700          */
4701         if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4702                 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4703         ASSERT(zio->io_data != NULL);
4704
4705         /*
4706          * Check this survived the L2ARC journey.
4707          */
4708         equal = arc_cksum_equal(buf);
4709         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4710                 mutex_exit(hash_lock);
4711                 zio->io_private = buf;
4712                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4713                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4714                 arc_read_done(zio);
4715         } else {
4716                 mutex_exit(hash_lock);
4717                 /*
4718                  * Buffer didn't survive caching.  Increment stats and
4719                  * reissue to the original storage device.
4720                  */
4721                 if (zio->io_error != 0) {
4722                         ARCSTAT_BUMP(arcstat_l2_io_error);
4723                 } else {
4724                         zio->io_error = SET_ERROR(EIO);
4725                 }
4726                 if (!equal)
4727                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4728
4729                 /*
4730                  * If there's no waiter, issue an async i/o to the primary
4731                  * storage now.  If there *is* a waiter, the caller must
4732                  * issue the i/o in a context where it's OK to block.
4733                  */
4734                 if (zio->io_waiter == NULL) {
4735                         zio_t *pio = zio_unique_parent(zio);
4736
4737                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4738
4739                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4740                             buf->b_data, zio->io_size, arc_read_done, buf,
4741                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4742                 }
4743         }
4744
4745         kmem_free(cb, sizeof (l2arc_read_callback_t));
4746 }
4747
4748 /*
4749  * This is the list priority from which the L2ARC will search for pages to
4750  * cache.  This is used within loops (0..3) to cycle through lists in the
4751  * desired order.  This order can have a significant effect on cache
4752  * performance.
4753  *
4754  * Currently the metadata lists are hit first, MFU then MRU, followed by
4755  * the data lists.  This function returns a locked list, and also returns
4756  * the lock pointer.
4757  */
4758 static list_t *
4759 l2arc_list_locked(int list_num, kmutex_t **lock)
4760 {
4761         list_t *list = NULL;
4762
4763         ASSERT(list_num >= 0 && list_num <= 3);
4764
4765         switch (list_num) {
4766         case 0:
4767                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4768                 *lock = &arc_mfu->arcs_mtx;
4769                 break;
4770         case 1:
4771                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4772                 *lock = &arc_mru->arcs_mtx;
4773                 break;
4774         case 2:
4775                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4776                 *lock = &arc_mfu->arcs_mtx;
4777                 break;
4778         case 3:
4779                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4780                 *lock = &arc_mru->arcs_mtx;
4781                 break;
4782         }
4783
4784         ASSERT(!(MUTEX_HELD(*lock)));
4785         mutex_enter(*lock);
4786         return (list);
4787 }
4788
4789 /*
4790  * Evict buffers from the device write hand to the distance specified in
4791  * bytes.  This distance may span populated buffers, it may span nothing.
4792  * This is clearing a region on the L2ARC device ready for writing.
4793  * If the 'all' boolean is set, every buffer is evicted.
4794  */
4795 static void
4796 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4797 {
4798         list_t *buflist;
4799         l2arc_buf_hdr_t *abl2;
4800         arc_buf_hdr_t *ab, *ab_prev;
4801         kmutex_t *hash_lock;
4802         uint64_t taddr;
4803
4804         buflist = dev->l2ad_buflist;
4805
4806         if (buflist == NULL)
4807                 return;
4808
4809         if (!all && dev->l2ad_first) {
4810                 /*
4811                  * This is the first sweep through the device.  There is
4812                  * nothing to evict.
4813                  */
4814                 return;
4815         }
4816
4817         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4818                 /*
4819                  * When nearing the end of the device, evict to the end
4820                  * before the device write hand jumps to the start.
4821                  */
4822                 taddr = dev->l2ad_end;
4823         } else {
4824                 taddr = dev->l2ad_hand + distance;
4825         }
4826         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4827             uint64_t, taddr, boolean_t, all);
4828
4829 top:
4830         mutex_enter(&l2arc_buflist_mtx);
4831         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4832                 ab_prev = list_prev(buflist, ab);
4833
4834                 hash_lock = HDR_LOCK(ab);
4835                 if (!mutex_tryenter(hash_lock)) {
4836                         /*
4837                          * Missed the hash lock.  Retry.
4838                          */
4839                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4840                         mutex_exit(&l2arc_buflist_mtx);
4841                         mutex_enter(hash_lock);
4842                         mutex_exit(hash_lock);
4843                         goto top;
4844                 }
4845
4846                 if (HDR_L2_WRITE_HEAD(ab)) {
4847                         /*
4848                          * We hit a write head node.  Leave it for
4849                          * l2arc_write_done().
4850                          */
4851                         list_remove(buflist, ab);
4852                         mutex_exit(hash_lock);
4853                         continue;
4854                 }
4855
4856                 if (!all && ab->b_l2hdr != NULL &&
4857                     (ab->b_l2hdr->b_daddr > taddr ||
4858                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4859                         /*
4860                          * We've evicted to the target address,
4861                          * or the end of the device.
4862                          */
4863                         mutex_exit(hash_lock);
4864                         break;
4865                 }
4866
4867                 if (HDR_FREE_IN_PROGRESS(ab)) {
4868                         /*
4869                          * Already on the path to destruction.
4870                          */
4871                         mutex_exit(hash_lock);
4872                         continue;
4873                 }
4874
4875                 if (ab->b_state == arc_l2c_only) {
4876                         ASSERT(!HDR_L2_READING(ab));
4877                         /*
4878                          * This doesn't exist in the ARC.  Destroy.
4879                          * arc_hdr_destroy() will call list_remove()
4880                          * and decrement arcstat_l2_size.
4881                          */
4882                         arc_change_state(arc_anon, ab, hash_lock);
4883                         arc_hdr_destroy(ab);
4884                 } else {
4885                         /*
4886                          * Invalidate issued or about to be issued
4887                          * reads, since we may be about to write
4888                          * over this location.
4889                          */
4890                         if (HDR_L2_READING(ab)) {
4891                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4892                                 ab->b_flags |= ARC_L2_EVICTED;
4893                         }
4894
4895                         /*
4896                          * Tell ARC this no longer exists in L2ARC.
4897                          */
4898                         if (ab->b_l2hdr != NULL) {
4899                                 abl2 = ab->b_l2hdr;
4900                                 ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4901                                 ab->b_l2hdr = NULL;
4902                                 kmem_cache_free(l2arc_hdr_cache, abl2);
4903                                 arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
4904                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4905                         }
4906                         list_remove(buflist, ab);
4907
4908                         /*
4909                          * This may have been leftover after a
4910                          * failed write.
4911                          */
4912                         ab->b_flags &= ~ARC_L2_WRITING;
4913                 }
4914                 mutex_exit(hash_lock);
4915         }
4916         mutex_exit(&l2arc_buflist_mtx);
4917
4918         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4919         dev->l2ad_evict = taddr;
4920 }
4921
4922 /*
4923  * Find and write ARC buffers to the L2ARC device.
4924  *
4925  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4926  * for reading until they have completed writing.
4927  * The headroom_boost is an in-out parameter used to maintain headroom boost
4928  * state between calls to this function.
4929  *
4930  * Returns the number of bytes actually written (which may be smaller than
4931  * the delta by which the device hand has changed due to alignment).
4932  */
4933 static uint64_t
4934 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4935     boolean_t *headroom_boost)
4936 {
4937         arc_buf_hdr_t *ab, *ab_prev, *head;
4938         list_t *list;
4939         uint64_t write_asize, write_psize, write_sz, headroom,
4940             buf_compress_minsz;
4941         void *buf_data;
4942         kmutex_t *list_lock = NULL;
4943         boolean_t full;
4944         l2arc_write_callback_t *cb;
4945         zio_t *pio, *wzio;
4946         uint64_t guid = spa_load_guid(spa);
4947         int try;
4948         const boolean_t do_headroom_boost = *headroom_boost;
4949
4950         ASSERT(dev->l2ad_vdev != NULL);
4951
4952         /* Lower the flag now, we might want to raise it again later. */
4953         *headroom_boost = B_FALSE;
4954
4955         pio = NULL;
4956         write_sz = write_asize = write_psize = 0;
4957         full = B_FALSE;
4958         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4959         head->b_flags |= ARC_L2_WRITE_HEAD;
4960
4961         /*
4962          * We will want to try to compress buffers that are at least 2x the
4963          * device sector size.
4964          */
4965         buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4966
4967         /*
4968          * Copy buffers for L2ARC writing.
4969          */
4970         mutex_enter(&l2arc_buflist_mtx);
4971         for (try = 0; try <= 3; try++) {
4972                 uint64_t passed_sz = 0;
4973
4974                 list = l2arc_list_locked(try, &list_lock);
4975
4976                 /*
4977                  * L2ARC fast warmup.
4978                  *
4979                  * Until the ARC is warm and starts to evict, read from the
4980                  * head of the ARC lists rather than the tail.
4981                  */
4982                 if (arc_warm == B_FALSE)
4983                         ab = list_head(list);
4984                 else
4985                         ab = list_tail(list);
4986
4987                 headroom = target_sz * l2arc_headroom;
4988                 if (do_headroom_boost)
4989                         headroom = (headroom * l2arc_headroom_boost) / 100;
4990
4991                 for (; ab; ab = ab_prev) {
4992                         l2arc_buf_hdr_t *l2hdr;
4993                         kmutex_t *hash_lock;
4994                         uint64_t buf_sz;
4995
4996                         if (arc_warm == B_FALSE)
4997                                 ab_prev = list_next(list, ab);
4998                         else
4999                                 ab_prev = list_prev(list, ab);
5000
5001                         hash_lock = HDR_LOCK(ab);
5002                         if (!mutex_tryenter(hash_lock)) {
5003                                 /*
5004                                  * Skip this buffer rather than waiting.
5005                                  */
5006                                 continue;
5007                         }
5008
5009                         passed_sz += ab->b_size;
5010                         if (passed_sz > headroom) {
5011                                 /*
5012                                  * Searched too far.
5013                                  */
5014                                 mutex_exit(hash_lock);
5015                                 break;
5016                         }
5017
5018                         if (!l2arc_write_eligible(guid, ab)) {
5019                                 mutex_exit(hash_lock);
5020                                 continue;
5021                         }
5022
5023                         if ((write_sz + ab->b_size) > target_sz) {
5024                                 full = B_TRUE;
5025                                 mutex_exit(hash_lock);
5026                                 break;
5027                         }
5028
5029                         if (pio == NULL) {
5030                                 /*
5031                                  * Insert a dummy header on the buflist so
5032                                  * l2arc_write_done() can find where the
5033                                  * write buffers begin without searching.
5034                                  */
5035                                 list_insert_head(dev->l2ad_buflist, head);
5036
5037                                 cb = kmem_alloc(sizeof (l2arc_write_callback_t),
5038                                     KM_PUSHPAGE);
5039                                 cb->l2wcb_dev = dev;
5040                                 cb->l2wcb_head = head;
5041                                 pio = zio_root(spa, l2arc_write_done, cb,
5042                                     ZIO_FLAG_CANFAIL);
5043                         }
5044
5045                         /*
5046                          * Create and add a new L2ARC header.
5047                          */
5048                         l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_PUSHPAGE);
5049                         l2hdr->b_dev = dev;
5050                         l2hdr->b_daddr = 0;
5051                         arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
5052
5053                         ab->b_flags |= ARC_L2_WRITING;
5054
5055                         /*
5056                          * Temporarily stash the data buffer in b_tmp_cdata.
5057                          * The subsequent write step will pick it up from
5058                          * there. This is because can't access ab->b_buf
5059                          * without holding the hash_lock, which we in turn
5060                          * can't access without holding the ARC list locks
5061                          * (which we want to avoid during compression/writing)
5062                          */
5063                         l2hdr->b_compress = ZIO_COMPRESS_OFF;
5064                         l2hdr->b_asize = ab->b_size;
5065                         l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5066                         l2hdr->b_hits = 0;
5067
5068                         buf_sz = ab->b_size;
5069                         ab->b_l2hdr = l2hdr;
5070
5071                         list_insert_head(dev->l2ad_buflist, ab);
5072
5073                         /*
5074                          * Compute and store the buffer cksum before
5075                          * writing.  On debug the cksum is verified first.
5076                          */
5077                         arc_cksum_verify(ab->b_buf);
5078                         arc_cksum_compute(ab->b_buf, B_TRUE);
5079
5080                         mutex_exit(hash_lock);
5081
5082                         write_sz += buf_sz;
5083                 }
5084
5085                 mutex_exit(list_lock);
5086
5087                 if (full == B_TRUE)
5088                         break;
5089         }
5090
5091         /* No buffers selected for writing? */
5092         if (pio == NULL) {
5093                 ASSERT0(write_sz);
5094                 mutex_exit(&l2arc_buflist_mtx);
5095                 kmem_cache_free(hdr_cache, head);
5096                 return (0);
5097         }
5098
5099         /*
5100          * Now start writing the buffers. We're starting at the write head
5101          * and work backwards, retracing the course of the buffer selector
5102          * loop above.
5103          */
5104         for (ab = list_prev(dev->l2ad_buflist, head); ab;
5105             ab = list_prev(dev->l2ad_buflist, ab)) {
5106                 l2arc_buf_hdr_t *l2hdr;
5107                 uint64_t buf_sz;
5108
5109                 /*
5110                  * We shouldn't need to lock the buffer here, since we flagged
5111                  * it as ARC_L2_WRITING in the previous step, but we must take
5112                  * care to only access its L2 cache parameters. In particular,
5113                  * ab->b_buf may be invalid by now due to ARC eviction.
5114                  */
5115                 l2hdr = ab->b_l2hdr;
5116                 l2hdr->b_daddr = dev->l2ad_hand;
5117
5118                 if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
5119                     l2hdr->b_asize >= buf_compress_minsz) {
5120                         if (l2arc_compress_buf(l2hdr)) {
5121                                 /*
5122                                  * If compression succeeded, enable headroom
5123                                  * boost on the next scan cycle.
5124                                  */
5125                                 *headroom_boost = B_TRUE;
5126                         }
5127                 }
5128
5129                 /*
5130                  * Pick up the buffer data we had previously stashed away
5131                  * (and now potentially also compressed).
5132                  */
5133                 buf_data = l2hdr->b_tmp_cdata;
5134                 buf_sz = l2hdr->b_asize;
5135
5136                 /* Compression may have squashed the buffer to zero length. */
5137                 if (buf_sz != 0) {
5138                         uint64_t buf_p_sz;
5139
5140                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
5141                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5142                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5143                             ZIO_FLAG_CANFAIL, B_FALSE);
5144
5145                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5146                             zio_t *, wzio);
5147                         (void) zio_nowait(wzio);
5148
5149                         write_asize += buf_sz;
5150                         /*
5151                          * Keep the clock hand suitably device-aligned.
5152                          */
5153                         buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5154                         write_psize += buf_p_sz;
5155                         dev->l2ad_hand += buf_p_sz;
5156                 }
5157         }
5158
5159         mutex_exit(&l2arc_buflist_mtx);
5160
5161         ASSERT3U(write_asize, <=, target_sz);
5162         ARCSTAT_BUMP(arcstat_l2_writes_sent);
5163         ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5164         ARCSTAT_INCR(arcstat_l2_size, write_sz);
5165         ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5166         vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5167
5168         /*
5169          * Bump device hand to the device start if it is approaching the end.
5170          * l2arc_evict() will already have evicted ahead for this case.
5171          */
5172         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5173                 vdev_space_update(dev->l2ad_vdev,
5174                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
5175                 dev->l2ad_hand = dev->l2ad_start;
5176                 dev->l2ad_evict = dev->l2ad_start;
5177                 dev->l2ad_first = B_FALSE;
5178         }
5179
5180         dev->l2ad_writing = B_TRUE;
5181         (void) zio_wait(pio);
5182         dev->l2ad_writing = B_FALSE;
5183
5184         return (write_asize);
5185 }
5186
5187 /*
5188  * Compresses an L2ARC buffer.
5189  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5190  * size in l2hdr->b_asize. This routine tries to compress the data and
5191  * depending on the compression result there are three possible outcomes:
5192  * *) The buffer was incompressible. The original l2hdr contents were left
5193  *    untouched and are ready for writing to an L2 device.
5194  * *) The buffer was all-zeros, so there is no need to write it to an L2
5195  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5196  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5197  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5198  *    data buffer which holds the compressed data to be written, and b_asize
5199  *    tells us how much data there is. b_compress is set to the appropriate
5200  *    compression algorithm. Once writing is done, invoke
5201  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5202  *
5203  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5204  * buffer was incompressible).
5205  */
5206 static boolean_t
5207 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5208 {
5209         void *cdata;
5210         size_t csize, len;
5211
5212         ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5213         ASSERT(l2hdr->b_tmp_cdata != NULL);
5214
5215         len = l2hdr->b_asize;
5216         cdata = zio_data_buf_alloc(len);
5217         csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5218             cdata, l2hdr->b_asize);
5219
5220         if (csize == 0) {
5221                 /* zero block, indicate that there's nothing to write */
5222                 zio_data_buf_free(cdata, len);
5223                 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5224                 l2hdr->b_asize = 0;
5225                 l2hdr->b_tmp_cdata = NULL;
5226                 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5227                 return (B_TRUE);
5228         } else if (csize > 0 && csize < len) {
5229                 /*
5230                  * Compression succeeded, we'll keep the cdata around for
5231                  * writing and release it afterwards.
5232                  */
5233                 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5234                 l2hdr->b_asize = csize;
5235                 l2hdr->b_tmp_cdata = cdata;
5236                 ARCSTAT_BUMP(arcstat_l2_compress_successes);
5237                 return (B_TRUE);
5238         } else {
5239                 /*
5240                  * Compression failed, release the compressed buffer.
5241                  * l2hdr will be left unmodified.
5242                  */
5243                 zio_data_buf_free(cdata, len);
5244                 ARCSTAT_BUMP(arcstat_l2_compress_failures);
5245                 return (B_FALSE);
5246         }
5247 }
5248
5249 /*
5250  * Decompresses a zio read back from an l2arc device. On success, the
5251  * underlying zio's io_data buffer is overwritten by the uncompressed
5252  * version. On decompression error (corrupt compressed stream), the
5253  * zio->io_error value is set to signal an I/O error.
5254  *
5255  * Please note that the compressed data stream is not checksummed, so
5256  * if the underlying device is experiencing data corruption, we may feed
5257  * corrupt data to the decompressor, so the decompressor needs to be
5258  * able to handle this situation (LZ4 does).
5259  */
5260 static void
5261 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5262 {
5263         uint64_t csize;
5264         void *cdata;
5265
5266         ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5267
5268         if (zio->io_error != 0) {
5269                 /*
5270                  * An io error has occured, just restore the original io
5271                  * size in preparation for a main pool read.
5272                  */
5273                 zio->io_orig_size = zio->io_size = hdr->b_size;
5274                 return;
5275         }
5276
5277         if (c == ZIO_COMPRESS_EMPTY) {
5278                 /*
5279                  * An empty buffer results in a null zio, which means we
5280                  * need to fill its io_data after we're done restoring the
5281                  * buffer's contents.
5282                  */
5283                 ASSERT(hdr->b_buf != NULL);
5284                 bzero(hdr->b_buf->b_data, hdr->b_size);
5285                 zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5286         } else {
5287                 ASSERT(zio->io_data != NULL);
5288                 /*
5289                  * We copy the compressed data from the start of the arc buffer
5290                  * (the zio_read will have pulled in only what we need, the
5291                  * rest is garbage which we will overwrite at decompression)
5292                  * and then decompress back to the ARC data buffer. This way we
5293                  * can minimize copying by simply decompressing back over the
5294                  * original compressed data (rather than decompressing to an
5295                  * aux buffer and then copying back the uncompressed buffer,
5296                  * which is likely to be much larger).
5297                  */
5298                 csize = zio->io_size;
5299                 cdata = zio_data_buf_alloc(csize);
5300                 bcopy(zio->io_data, cdata, csize);
5301                 if (zio_decompress_data(c, cdata, zio->io_data, csize,
5302                     hdr->b_size) != 0)
5303                         zio->io_error = SET_ERROR(EIO);
5304                 zio_data_buf_free(cdata, csize);
5305         }
5306
5307         /* Restore the expected uncompressed IO size. */
5308         zio->io_orig_size = zio->io_size = hdr->b_size;
5309 }
5310
5311 /*
5312  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5313  * This buffer serves as a temporary holder of compressed data while
5314  * the buffer entry is being written to an l2arc device. Once that is
5315  * done, we can dispose of it.
5316  */
5317 static void
5318 l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5319 {
5320         l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5321
5322         if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5323                 /*
5324                  * If the data was compressed, then we've allocated a
5325                  * temporary buffer for it, so now we need to release it.
5326                  */
5327                 ASSERT(l2hdr->b_tmp_cdata != NULL);
5328                 zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5329         }
5330         l2hdr->b_tmp_cdata = NULL;
5331 }
5332
5333 /*
5334  * This thread feeds the L2ARC at regular intervals.  This is the beating
5335  * heart of the L2ARC.
5336  */
5337 static void
5338 l2arc_feed_thread(void)
5339 {
5340         callb_cpr_t cpr;
5341         l2arc_dev_t *dev;
5342         spa_t *spa;
5343         uint64_t size, wrote;
5344         clock_t begin, next = ddi_get_lbolt();
5345         boolean_t headroom_boost = B_FALSE;
5346
5347         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5348
5349         mutex_enter(&l2arc_feed_thr_lock);
5350
5351         while (l2arc_thread_exit == 0) {
5352                 CALLB_CPR_SAFE_BEGIN(&cpr);
5353                 (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
5354                     &l2arc_feed_thr_lock, next);
5355                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5356                 next = ddi_get_lbolt() + hz;
5357
5358                 /*
5359                  * Quick check for L2ARC devices.
5360                  */
5361                 mutex_enter(&l2arc_dev_mtx);
5362                 if (l2arc_ndev == 0) {
5363                         mutex_exit(&l2arc_dev_mtx);
5364                         continue;
5365                 }
5366                 mutex_exit(&l2arc_dev_mtx);
5367                 begin = ddi_get_lbolt();
5368
5369                 /*
5370                  * This selects the next l2arc device to write to, and in
5371                  * doing so the next spa to feed from: dev->l2ad_spa.   This
5372                  * will return NULL if there are now no l2arc devices or if
5373                  * they are all faulted.
5374                  *
5375                  * If a device is returned, its spa's config lock is also
5376                  * held to prevent device removal.  l2arc_dev_get_next()
5377                  * will grab and release l2arc_dev_mtx.
5378                  */
5379                 if ((dev = l2arc_dev_get_next()) == NULL)
5380                         continue;
5381
5382                 spa = dev->l2ad_spa;
5383                 ASSERT(spa != NULL);
5384
5385                 /*
5386                  * If the pool is read-only then force the feed thread to
5387                  * sleep a little longer.
5388                  */
5389                 if (!spa_writeable(spa)) {
5390                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5391                         spa_config_exit(spa, SCL_L2ARC, dev);
5392                         continue;
5393                 }
5394
5395                 /*
5396                  * Avoid contributing to memory pressure.
5397                  */
5398                 if (arc_no_grow) {
5399                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5400                         spa_config_exit(spa, SCL_L2ARC, dev);
5401                         continue;
5402                 }
5403
5404                 ARCSTAT_BUMP(arcstat_l2_feeds);
5405
5406                 size = l2arc_write_size();
5407
5408                 /*
5409                  * Evict L2ARC buffers that will be overwritten.
5410                  */
5411                 l2arc_evict(dev, size, B_FALSE);
5412
5413                 /*
5414                  * Write ARC buffers.
5415                  */
5416                 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5417
5418                 /*
5419                  * Calculate interval between writes.
5420                  */
5421                 next = l2arc_write_interval(begin, size, wrote);
5422                 spa_config_exit(spa, SCL_L2ARC, dev);
5423         }
5424
5425         l2arc_thread_exit = 0;
5426         cv_broadcast(&l2arc_feed_thr_cv);
5427         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
5428         thread_exit();
5429 }
5430
5431 boolean_t
5432 l2arc_vdev_present(vdev_t *vd)
5433 {
5434         l2arc_dev_t *dev;
5435
5436         mutex_enter(&l2arc_dev_mtx);
5437         for (dev = list_head(l2arc_dev_list); dev != NULL;
5438             dev = list_next(l2arc_dev_list, dev)) {
5439                 if (dev->l2ad_vdev == vd)
5440                         break;
5441         }
5442         mutex_exit(&l2arc_dev_mtx);
5443
5444         return (dev != NULL);
5445 }
5446
5447 /*
5448  * Add a vdev for use by the L2ARC.  By this point the spa has already
5449  * validated the vdev and opened it.
5450  */
5451 void
5452 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5453 {
5454         l2arc_dev_t *adddev;
5455
5456         ASSERT(!l2arc_vdev_present(vd));
5457
5458         /*
5459          * Create a new l2arc device entry.
5460          */
5461         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5462         adddev->l2ad_spa = spa;
5463         adddev->l2ad_vdev = vd;
5464         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5465         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5466         adddev->l2ad_hand = adddev->l2ad_start;
5467         adddev->l2ad_evict = adddev->l2ad_start;
5468         adddev->l2ad_first = B_TRUE;
5469         adddev->l2ad_writing = B_FALSE;
5470         list_link_init(&adddev->l2ad_node);
5471
5472         /*
5473          * This is a list of all ARC buffers that are still valid on the
5474          * device.
5475          */
5476         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5477         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5478             offsetof(arc_buf_hdr_t, b_l2node));
5479
5480         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5481
5482         /*
5483          * Add device to global list
5484          */
5485         mutex_enter(&l2arc_dev_mtx);
5486         list_insert_head(l2arc_dev_list, adddev);
5487         atomic_inc_64(&l2arc_ndev);
5488         mutex_exit(&l2arc_dev_mtx);
5489 }
5490
5491 /*
5492  * Remove a vdev from the L2ARC.
5493  */
5494 void
5495 l2arc_remove_vdev(vdev_t *vd)
5496 {
5497         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5498
5499         /*
5500          * Find the device by vdev
5501          */
5502         mutex_enter(&l2arc_dev_mtx);
5503         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5504                 nextdev = list_next(l2arc_dev_list, dev);
5505                 if (vd == dev->l2ad_vdev) {
5506                         remdev = dev;
5507                         break;
5508                 }
5509         }
5510         ASSERT(remdev != NULL);
5511
5512         /*
5513          * Remove device from global list
5514          */
5515         list_remove(l2arc_dev_list, remdev);
5516         l2arc_dev_last = NULL;          /* may have been invalidated */
5517         atomic_dec_64(&l2arc_ndev);
5518         mutex_exit(&l2arc_dev_mtx);
5519
5520         /*
5521          * Clear all buflists and ARC references.  L2ARC device flush.
5522          */
5523         l2arc_evict(remdev, 0, B_TRUE);
5524         list_destroy(remdev->l2ad_buflist);
5525         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5526         kmem_free(remdev, sizeof (l2arc_dev_t));
5527 }
5528
5529 void
5530 l2arc_init(void)
5531 {
5532         l2arc_thread_exit = 0;
5533         l2arc_ndev = 0;
5534         l2arc_writes_sent = 0;
5535         l2arc_writes_done = 0;
5536
5537         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5538         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5539         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5540         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5541         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5542
5543         l2arc_dev_list = &L2ARC_dev_list;
5544         l2arc_free_on_write = &L2ARC_free_on_write;
5545         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5546             offsetof(l2arc_dev_t, l2ad_node));
5547         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5548             offsetof(l2arc_data_free_t, l2df_list_node));
5549 }
5550
5551 void
5552 l2arc_fini(void)
5553 {
5554         /*
5555          * This is called from dmu_fini(), which is called from spa_fini();
5556          * Because of this, we can assume that all l2arc devices have
5557          * already been removed when the pools themselves were removed.
5558          */
5559
5560         l2arc_do_free_on_write();
5561
5562         mutex_destroy(&l2arc_feed_thr_lock);
5563         cv_destroy(&l2arc_feed_thr_cv);
5564         mutex_destroy(&l2arc_dev_mtx);
5565         mutex_destroy(&l2arc_buflist_mtx);
5566         mutex_destroy(&l2arc_free_on_write_mtx);
5567
5568         list_destroy(l2arc_dev_list);
5569         list_destroy(l2arc_free_on_write);
5570 }
5571
5572 void
5573 l2arc_start(void)
5574 {
5575         if (!(spa_mode_global & FWRITE))
5576                 return;
5577
5578         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5579             TS_RUN, minclsyspri);
5580 }
5581
5582 void
5583 l2arc_stop(void)
5584 {
5585         if (!(spa_mode_global & FWRITE))
5586                 return;
5587
5588         mutex_enter(&l2arc_feed_thr_lock);
5589         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
5590         l2arc_thread_exit = 1;
5591         while (l2arc_thread_exit != 0)
5592                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5593         mutex_exit(&l2arc_feed_thr_lock);
5594 }
5595
/*
 * Linux kernel module glue, compiled only for in-kernel builds that use
 * the SPL.  It exports part of the ARC interface and registers the ARC and
 * L2ARC tunables as module parameters.  All parameters are registered
 * with permission bits 0644.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* ARC entry points exported with EXPORT_SYMBOL. */
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_remove_ref);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

/* ARC sizing and behaviour tunables. */
module_param(zfs_arc_min, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_min, "Min arc size");

module_param(zfs_arc_max, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_max, "Max arc size");

module_param(zfs_arc_meta_limit, ulong, 0644);
MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");

module_param(zfs_arc_meta_prune, int, 0644);
MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");

module_param(zfs_arc_grow_retry, int, 0644);
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");

module_param(zfs_arc_p_aggressive_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");

module_param(zfs_arc_p_dampener_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");

module_param(zfs_arc_shrink_shift, int, 0644);
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");

module_param(zfs_disable_dup_eviction, int, 0644);
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");

module_param(zfs_arc_memory_throttle_disable, int, 0644);
MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");

module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");

/* L2ARC feed and write tunables. */
module_param(l2arc_write_max, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");

module_param(l2arc_write_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");

module_param(l2arc_headroom, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");

module_param(l2arc_headroom_boost, ulong, 0644);
MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");

module_param(l2arc_feed_secs, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");

module_param(l2arc_feed_min_ms, ulong, 0644);
MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");

module_param(l2arc_noprefetch, int, 0644);
MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");

module_param(l2arc_nocompress, int, 0644);
MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");

module_param(l2arc_feed_again, int, 0644);
MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");

module_param(l2arc_norw, int, 0644);
MODULE_PARM_DESC(l2arc_norw, "No reads during writes");

#endif