sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * DVA-based Adjustable Replacement Cache
  28  *
  29  * While much of the theory of operation used here is
  30  * based on the self-tuning, low overhead replacement cache
  31  * presented by Megiddo and Modha at FAST 2003, there are some
  32  * significant differences:
  33  *
  34  * 1. The Megiddo and Modha model assumes any page is evictable.
  35  * Pages in its cache cannot be "locked" into memory.  This makes
  36  * the eviction algorithm simple: evict the last page in the list.
  37  * This also make the performance characteristics easy to reason
  38  * about.  Our cache is not so simple.  At any given moment, some
  39  * subset of the blocks in the cache are un-evictable because we
  40  * have handed out a reference to them.  Blocks are only evictable
  41  * when there are no external references active.  This makes
  42  * eviction far more problematic:  we choose to evict the evictable
  43  * blocks that are the "lowest" in the list.
  44  *
  45  * There are times when it is not possible to evict the requested
  46  * space.  In these circumstances we are unable to adjust the cache
  47  * size.  To prevent the cache growing unbounded at these times we
  48  * implement a "cache throttle" that slows the flow of new data
  49  * into the cache until we can make space available.
  50  *
  51  * 2. The Megiddo and Modha model assumes a fixed cache size.
  52  * Pages are evicted when the cache is full and there is a cache
  53  * miss.  Our model has a variable sized cache.  It grows with
  54  * high use, but also tries to react to memory pressure from the
  55  * operating system: decreasing its size when system memory is
  56  * tight.
  57  *
  58  * 3. The Megiddo and Modha model assumes a fixed page size. All
  59  * elements of the cache are therefor exactly the same size.  So
  60  * when adjusting the cache size following a cache miss, its simply
  61  * a matter of choosing a single page to evict.  In our model, we
  62  * have variable sized cache blocks (rangeing from 512 bytes to
  63  * 128K bytes).  We therefor choose a set of blocks to evict to make
  64  * space for a cache miss that approximates as closely as possible
  65  * the space used by the new block.
  66  *
  67  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  68  * by N. Megiddo & D. Modha, FAST 2003
  69  */
  70
  71 /*
  72  * The locking model:
  73  *
  74  * A new reference to a cache buffer can be obtained in two
  75  * ways: 1) via a hash table lookup using the DVA as a key,
  76  * or 2) via one of the ARC lists.  The arc_read() interface
  77  * uses method 1, while the internal arc algorithms for
  78  * adjusting the cache use method 2.  We therefor provide two
  79  * types of locks: 1) the hash table lock array, and 2) the
  80  * arc list locks.
  81  *
  82  * Buffers do not have their own mutexs, rather they rely on the
  83  * hash table mutexs for the bulk of their protection (i.e. most
  84  * fields in the arc_buf_hdr_t are protected by these mutexs).
  85  *
  86  * buf_hash_find() returns the appropriate mutex (held) when it
  87  * locates the requested buffer in the hash table.  It returns
  88  * NULL for the mutex if the buffer was not in the table.
  89  *
  90  * buf_hash_remove() expects the appropriate hash mutex to be
  91  * already held before it is invoked.
  92  *
  93  * Each arc state also has a mutex which is used to protect the
  94  * buffer list associated with the state.  When attempting to
  95  * obtain a hash table lock while holding an arc list lock you
  96  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  97  * the active state mutex must be held before the ghost state mutex.
  98  *
  99  * Arc buffers may have an associated eviction callback function.
 100  * This function will be invoked prior to removing the buffer (e.g.
 101  * in arc_do_user_evicts()).  Note however that the data associated
 102  * with the buffer may be evicted prior to the callback.  The callback
 103  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 104  * the users of callbacks must ensure that their private data is
 105  * protected from simultaneous callbacks from arc_buf_evict()
 106  * and arc_do_user_evicts().
 107  *
 108  * Note that the majority of the performance stats are manipulated
 109  * with atomic operations.
 110  *
 111  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 112  *
 113  *      - L2ARC buflist creation
 114  *      - L2ARC buflist eviction
 115  *      - L2ARC write completion, which walks L2ARC buflists
 116  *      - ARC header destruction, as it removes from L2ARC buflists
 117  *      - ARC header release, as it removes from L2ARC buflists
 118  */
 119
 120 #include <sys/spa.h>
 121 #include <sys/zio.h>
 122 #include <sys/zio_checksum.h>
 123 #include <sys/zfs_context.h>
 124 #include <sys/arc.h>
 125 #include <sys/refcount.h>
 126 #include <sys/vdev.h>
 127 #ifdef _KERNEL
 128 #include <sys/dnlc.h>
 129 #endif
 130 #include <sys/callb.h>
 131 #include <sys/kstat.h>
 132 #include <sys/sdt.h>
 133
 134 #include <vm/vm_pageout.h>
 135
 136 static kmutex_t         arc_reclaim_thr_lock;
 137 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 138 static uint8_t          arc_thread_exit;
 139
 140 extern int zfs_write_limit_shift;
 141 extern uint64_t zfs_write_limit_max;
 142 extern kmutex_t zfs_write_limit_lock;
 143
 144 #define ARC_REDUCE_DNLC_PERCENT 3
 145 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 146
 147 typedef enum arc_reclaim_strategy {
 148         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 149         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 150 } arc_reclaim_strategy_t;
 151
 152 /* number of seconds before growing cache again */
 153 static int              arc_grow_retry = 60;
 154
 155 /*
 156  * minimum lifespan of a prefetch block in clock ticks
 157  * (initialized in arc_init())
 158  */
 159 static int              arc_min_prefetch_lifespan;
 160
 161 extern int zfs_prefetch_disable;
 162 static int arc_dead;
 163
 164 /*
 165  * The arc has filled available memory and has now warmed up.
 166  */
 167 static boolean_t arc_warm;
 168
 169 /*
 170  * These tunables are for performance analysis.
 171  */
 172 uint64_t zfs_arc_max;
 173 uint64_t zfs_arc_min;
 174 uint64_t zfs_arc_meta_limit = 0;
 175 int zfs_mdcomp_disable = 0;
 176
 177 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 178 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 179 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 180 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
 181 SYSCTL_DECL(_vfs_zfs);
 182 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
 183     "Maximum ARC size");
 184 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
 185     "Minimum ARC size");
 186 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
 187     &zfs_mdcomp_disable, 0, "Disable metadata compression");
 188
 189 /*
 190  * Note that buffers can be in one of 6 states:
 191  *      ARC_anon        - anonymous (discussed below)
 192  *      ARC_mru         - recently used, currently cached
 193  *      ARC_mru_ghost   - recentely used, no longer in cache
 194  *      ARC_mfu         - frequently used, currently cached
 195  *      ARC_mfu_ghost   - frequently used, no longer in cache
 196  *      ARC_l2c_only    - exists in L2ARC but not other states
 197  * When there are no active references to the buffer, they are
 198  * are linked onto a list in one of these arc states.  These are
 199  * the only buffers that can be evicted or deleted.  Within each
 200  * state there are multiple lists, one for meta-data and one for
 201  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 202  * etc.) is tracked separately so that it can be managed more
 203  * explicitly: favored over data, limited explicitly.
 204  *
 205  * Anonymous buffers are buffers that are not associated with
 206  * a DVA.  These are buffers that hold dirty block copies
 207  * before they are written to stable storage.  By definition,
 208  * they are "ref'd" and are considered part of arc_mru
 209  * that cannot be freed.  Generally, they will aquire a DVA
 210  * as they are written and migrate onto the arc_mru list.
 211  *
 212  * The ARC_l2c_only state is for buffers that are in the second
 213  * level ARC but no longer in any of the ARC_m* lists.  The second
 214  * level ARC itself may also contain buffers that are in any of
 215  * the ARC_m* states - meaning that a buffer can exist in two
 216  * places.  The reason for the ARC_l2c_only state is to keep the
 217  * buffer header in the hash table, so that reads that hit the
 218  * second level ARC benefit from these fast lookups.
 219  */
 220
 221 typedef struct arc_state {
 222         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 223         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 224         uint64_t arcs_size;     /* total amount of data in this state */
 225         kmutex_t arcs_mtx;
 226 } arc_state_t;
 227
 228 /* The 6 states: */
 229 static arc_state_t ARC_anon;
 230 static arc_state_t ARC_mru;
 231 static arc_state_t ARC_mru_ghost;
 232 static arc_state_t ARC_mfu;
 233 static arc_state_t ARC_mfu_ghost;
 234 static arc_state_t ARC_l2c_only;
 235
 236 typedef struct arc_stats {
 237         kstat_named_t arcstat_hits;
 238         kstat_named_t arcstat_misses;
 239         kstat_named_t arcstat_demand_data_hits;
 240         kstat_named_t arcstat_demand_data_misses;
 241         kstat_named_t arcstat_demand_metadata_hits;
 242         kstat_named_t arcstat_demand_metadata_misses;
 243         kstat_named_t arcstat_prefetch_data_hits;
 244         kstat_named_t arcstat_prefetch_data_misses;
 245         kstat_named_t arcstat_prefetch_metadata_hits;
 246         kstat_named_t arcstat_prefetch_metadata_misses;
 247         kstat_named_t arcstat_mru_hits;
 248         kstat_named_t arcstat_mru_ghost_hits;
 249         kstat_named_t arcstat_mfu_hits;
 250         kstat_named_t arcstat_mfu_ghost_hits;
 251         kstat_named_t arcstat_deleted;
 252         kstat_named_t arcstat_recycle_miss;
 253         kstat_named_t arcstat_mutex_miss;
 254         kstat_named_t arcstat_evict_skip;
 255         kstat_named_t arcstat_hash_elements;
 256         kstat_named_t arcstat_hash_elements_max;
 257         kstat_named_t arcstat_hash_collisions;
 258         kstat_named_t arcstat_hash_chains;
 259         kstat_named_t arcstat_hash_chain_max;
 260         kstat_named_t arcstat_p;
 261         kstat_named_t arcstat_c;
 262         kstat_named_t arcstat_c_min;
 263         kstat_named_t arcstat_c_max;
 264         kstat_named_t arcstat_size;
 265         kstat_named_t arcstat_hdr_size;
 266         kstat_named_t arcstat_l2_hits;
 267         kstat_named_t arcstat_l2_misses;
 268         kstat_named_t arcstat_l2_feeds;
 269         kstat_named_t arcstat_l2_rw_clash;
 270         kstat_named_t arcstat_l2_writes_sent;
 271         kstat_named_t arcstat_l2_writes_done;
 272         kstat_named_t arcstat_l2_writes_error;
 273         kstat_named_t arcstat_l2_writes_hdr_miss;
 274         kstat_named_t arcstat_l2_evict_lock_retry;
 275         kstat_named_t arcstat_l2_evict_reading;
 276         kstat_named_t arcstat_l2_free_on_write;
 277         kstat_named_t arcstat_l2_abort_lowmem;
 278         kstat_named_t arcstat_l2_cksum_bad;
 279         kstat_named_t arcstat_l2_io_error;
 280         kstat_named_t arcstat_l2_size;
 281         kstat_named_t arcstat_l2_hdr_size;
 282         kstat_named_t arcstat_memory_throttle_count;
 283 } arc_stats_t;
 284
 285 static arc_stats_t arc_stats = {
 286         { "hits",                       KSTAT_DATA_UINT64 },
 287         { "misses",                     KSTAT_DATA_UINT64 },
 288         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 289         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 290         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 291         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 292         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 293         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 294         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 295         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 296         { "mru_hits",                   KSTAT_DATA_UINT64 },
 297         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 298         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 299         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 300         { "deleted",                    KSTAT_DATA_UINT64 },
 301         { "recycle_miss",               KSTAT_DATA_UINT64 },
 302         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 303         { "evict_skip",                 KSTAT_DATA_UINT64 },
 304         { "hash_elements",              KSTAT_DATA_UINT64 },
 305         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 306         { "hash_collisions",            KSTAT_DATA_UINT64 },
 307         { "hash_chains",                KSTAT_DATA_UINT64 },
 308         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 309         { "p",                          KSTAT_DATA_UINT64 },
 310         { "c",                          KSTAT_DATA_UINT64 },
 311         { "c_min",                      KSTAT_DATA_UINT64 },
 312         { "c_max",                      KSTAT_DATA_UINT64 },
 313         { "size",                       KSTAT_DATA_UINT64 },
 314         { "hdr_size",                   KSTAT_DATA_UINT64 },
 315         { "l2_hits",                    KSTAT_DATA_UINT64 },
 316         { "l2_misses",                  KSTAT_DATA_UINT64 },
 317         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 318         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 319         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 320         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 321         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 322         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 323         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 324         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 325         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 326         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 327         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 328         { "l2_io_error",                KSTAT_DATA_UINT64 },
 329         { "l2_size",                    KSTAT_DATA_UINT64 },
 330         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 331         { "memory_throttle_count",      KSTAT_DATA_UINT64 }
 332 };
 333
 334 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 335
 336 #define ARCSTAT_INCR(stat, val) \
 337         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 338
 339 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 340 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 341
 342 #define ARCSTAT_MAX(stat, val) {                                        \
 343         uint64_t m;                                                     \
 344         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 345             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 346                 continue;                                               \
 347 }
 348
 349 #define ARCSTAT_MAXSTAT(stat) \
 350         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 351
 352 /*
 353  * We define a macro to allow ARC hits/misses to be easily broken down by
 354  * two separate conditions, giving a total of four different subtypes for
 355  * each of hits and misses (so eight statistics total).
 356  */
 357 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 358         if (cond1) {                                                    \
 359                 if (cond2) {                                            \
 360                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 361                 } else {                                                \
 362                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 363                 }                                                       \
 364         } else {                                                        \
 365                 if (cond2) {                                            \
 366                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 367                 } else {                                                \
 368                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 369                 }                                                       \
 370         }
 371
 372 kstat_t                 *arc_ksp;
 373 static arc_state_t      *arc_anon;
 374 static arc_state_t      *arc_mru;
 375 static arc_state_t      *arc_mru_ghost;
 376 static arc_state_t      *arc_mfu;
 377 static arc_state_t      *arc_mfu_ghost;
 378 static arc_state_t      *arc_l2c_only;
 379
 380 /*
 381  * There are several ARC variables that are critical to export as kstats --
 382  * but we don't want to have to grovel around in the kstat whenever we wish to
 383  * manipulate them.  For these variables, we therefore define them to be in
 384  * terms of the statistic variable.  This assures that we are not introducing
 385  * the possibility of inconsistency by having shadow copies of the variables,
 386  * while still allowing the code to be readable.
 387  */
 388 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 389 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 390 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 391 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 392 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 393
 394 static int              arc_no_grow;    /* Don't try to grow cache size */
 395 static uint64_t         arc_tempreserve;
 396 static uint64_t         arc_meta_used;
 397 static uint64_t         arc_meta_limit;
 398 static uint64_t         arc_meta_max = 0;
 399 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
 400     &arc_meta_used, 0, "ARC metadata used");
 401 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
 402     &arc_meta_limit, 0, "ARC metadata limit");
 403
 404 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 405
 406 typedef struct arc_callback arc_callback_t;
 407
 408 struct arc_callback {
 409         void                    *acb_private;
 410         arc_done_func_t         *acb_done;
 411         arc_buf_t               *acb_buf;
 412         zio_t                   *acb_zio_dummy;
 413         arc_callback_t          *acb_next;
 414 };
 415
 416 typedef struct arc_write_callback arc_write_callback_t;
 417
 418 struct arc_write_callback {
 419         void            *awcb_private;
 420         arc_done_func_t *awcb_ready;
 421         arc_done_func_t *awcb_done;
 422         arc_buf_t       *awcb_buf;
 423 };
 424
 425 struct arc_buf_hdr {
 426         /* protected by hash lock */
 427         dva_t                   b_dva;
 428         uint64_t                b_birth;
 429         uint64_t                b_cksum0;
 430
 431         kmutex_t                b_freeze_lock;
 432         zio_cksum_t             *b_freeze_cksum;
 433
 434         arc_buf_hdr_t           *b_hash_next;
 435         arc_buf_t               *b_buf;
 436         uint32_t                b_flags;
 437         uint32_t                b_datacnt;
 438
 439         arc_callback_t          *b_acb;
 440         kcondvar_t              b_cv;
 441
 442         /* immutable */
 443         arc_buf_contents_t      b_type;
 444         uint64_t                b_size;
 445         spa_t                   *b_spa;
 446
 447         /* protected by arc state mutex */
 448         arc_state_t             *b_state;
 449         list_node_t             b_arc_node;
 450
 451         /* updated atomically */
 452         clock_t                 b_arc_access;
 453
 454         /* self protecting */
 455         refcount_t              b_refcnt;
 456
 457         l2arc_buf_hdr_t         *b_l2hdr;
 458         list_node_t             b_l2node;
 459 };
 460
 461 static arc_buf_t *arc_eviction_list;
 462 static kmutex_t arc_eviction_mtx;
 463 static arc_buf_hdr_t arc_eviction_hdr;
 464 static void arc_get_data_buf(arc_buf_t *buf);
 465 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 466 static int arc_evict_needed(arc_buf_contents_t type);
 467 static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
 468
 469 #define GHOST_STATE(state)      \
 470         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 471         (state) == arc_l2c_only)
 472
 473 /*
 474  * Private ARC flags.  These flags are private ARC only flags that will show up
 475  * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
 476  * be passed in as arc_flags in things like arc_read.  However, these flags
 477  * should never be passed and should only be set by ARC code.  When adding new
 478  * public flags, make sure not to smash the private ones.
 479  */
 480
 481 #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 482 #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 483 #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 484 #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 485 #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 486 #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 487 #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 488 #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 489 #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 490 #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 491 #define ARC_STORED              (1 << 19)       /* has been store()d to */
 492
 493 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 494 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 495 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 496 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 497 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 498 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 499 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 500 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
 501                                     (hdr)->b_l2hdr != NULL)
 502 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 503 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 504 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 505
 506 /*
 507  * Other sizes
 508  */
 509
 510 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 511 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 512
 513 /*
 514  * Hash table routines
 515  */
 516
 517 #define HT_LOCK_PAD     128
 518
 519 struct ht_lock {
 520         kmutex_t        ht_lock;
 521 #ifdef _KERNEL
 522         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
 523 #endif
 524 };
 525
 526 #define BUF_LOCKS 256
 527 typedef struct buf_hash_table {
 528         uint64_t ht_mask;
 529         arc_buf_hdr_t **ht_table;
 530         struct ht_lock ht_locks[BUF_LOCKS];
 531 } buf_hash_table_t;
 532
 533 static buf_hash_table_t buf_hash_table;
 534
 535 #define BUF_HASH_INDEX(spa, dva, birth) \
 536         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 537 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 538 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 539 #define HDR_LOCK(buf) \
 540         (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
 541
 542 uint64_t zfs_crc64_table[256];
 543
 544 /*
 545  * Level 2 ARC
 546  */
 547
 548 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 549 #define L2ARC_HEADROOM          4               /* num of writes */
 550 #define L2ARC_FEED_SECS         1               /* caching interval */
 551
 552 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 553 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 554
 555 /*
 556  * L2ARC Performance Tunables
 557  */
 558 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 559 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 560 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 561 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 562 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 563
 564 /*
 565  * L2ARC Internals
 566  */
 567 typedef struct l2arc_dev {
 568         vdev_t                  *l2ad_vdev;     /* vdev */
 569         spa_t                   *l2ad_spa;      /* spa */
 570         uint64_t                l2ad_hand;      /* next write location */
 571         uint64_t                l2ad_write;     /* desired write size, bytes */
 572         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 573         uint64_t                l2ad_start;     /* first addr on device */
 574         uint64_t                l2ad_end;       /* last addr on device */
 575         uint64_t                l2ad_evict;     /* last addr eviction reached */
 576         boolean_t               l2ad_first;     /* first sweep through */
 577         list_t                  *l2ad_buflist;  /* buffer list */
 578         list_node_t             l2ad_node;      /* device list node */
 579 } l2arc_dev_t;
 580
 581 static list_t L2ARC_dev_list;                   /* device list */
 582 static list_t *l2arc_dev_list;                  /* device list pointer */
 583 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 584 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 585 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 586 static list_t L2ARC_free_on_write;              /* free after write buf list */
 587 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 588 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 589 static uint64_t l2arc_ndev;                     /* number of devices */
 590
 591 typedef struct l2arc_read_callback {
 592         arc_buf_t       *l2rcb_buf;             /* read buffer */
 593         spa_t           *l2rcb_spa;             /* spa */
 594         blkptr_t        l2rcb_bp;               /* original blkptr */
 595         zbookmark_t     l2rcb_zb;               /* original bookmark */
 596         int             l2rcb_flags;            /* original flags */
 597 } l2arc_read_callback_t;
 598
 599 typedef struct l2arc_write_callback {
 600         l2arc_dev_t     *l2wcb_dev;             /* device info */
 601         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 602 } l2arc_write_callback_t;
 603
 604 struct l2arc_buf_hdr {
 605         /* protected by arc_buf_hdr  mutex */
 606         l2arc_dev_t     *b_dev;                 /* L2ARC device */
 607         daddr_t         b_daddr;                /* disk address, offset byte */
 608 };
 609
 610 typedef struct l2arc_data_free {
 611         /* protected by l2arc_free_on_write_mtx */
 612         void            *l2df_data;
 613         size_t          l2df_size;
 614         void            (*l2df_func)(void *, size_t);
 615         list_node_t     l2df_list_node;
 616 } l2arc_data_free_t;
 617
 618 static kmutex_t l2arc_feed_thr_lock;
 619 static kcondvar_t l2arc_feed_thr_cv;
 620 static uint8_t l2arc_thread_exit;
 621
 622 static void l2arc_read_done(zio_t *zio);
 623 static void l2arc_hdr_stat_add(void);
 624 static void l2arc_hdr_stat_remove(void);
 625
 626 static uint64_t
 627 buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 628 {
 629         uintptr_t spav = (uintptr_t)spa;
 630         uint8_t *vdva = (uint8_t *)dva;
 631         uint64_t crc = -1ULL;
 632         int i;
 633
 634         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 635
 636         for (i = 0; i < sizeof (dva_t); i++)
 637                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 638
 639         crc ^= (spav>>8) ^ birth;
 640
 641         return (crc);
 642 }
 643
 644 #define BUF_EMPTY(buf)                                          \
 645         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 646         (buf)->b_dva.dva_word[1] == 0 &&                        \
 647         (buf)->b_birth == 0)
 648
 649 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 650         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 651         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 652         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 653
 654 static arc_buf_hdr_t *
 655 buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 656 {
 657         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 658         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 659         arc_buf_hdr_t *buf;
 660
 661         mutex_enter(hash_lock);
 662         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 663             buf = buf->b_hash_next) {
 664                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 665                         *lockp = hash_lock;
 666                         return (buf);
 667                 }
 668         }
 669         mutex_exit(hash_lock);
 670         *lockp = NULL;
 671         return (NULL);
 672 }
 673
 674 /*
 675  * Insert an entry into the hash table.  If there is already an element
 676  * equal to elem in the hash table, then the already existing element
 677  * will be returned and the new element will not be inserted.
 678  * Otherwise returns NULL.
 679  */
 680 static arc_buf_hdr_t *
 681 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 682 {
 683         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 684         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 685         arc_buf_hdr_t *fbuf;
 686         uint32_t i;
 687
 688         ASSERT(!HDR_IN_HASH_TABLE(buf));
 689         *lockp = hash_lock;
 690         mutex_enter(hash_lock);
 691         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 692             fbuf = fbuf->b_hash_next, i++) {
 693                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 694                         return (fbuf);
 695         }
 696
 697         buf->b_hash_next = buf_hash_table.ht_table[idx];
 698         buf_hash_table.ht_table[idx] = buf;
 699         buf->b_flags |= ARC_IN_HASH_TABLE;
 700
 701         /* collect some hash table performance data */
 702         if (i > 0) {
 703                 ARCSTAT_BUMP(arcstat_hash_collisions);
 704                 if (i == 1)
 705                         ARCSTAT_BUMP(arcstat_hash_chains);
 706
 707                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 708         }
 709
 710         ARCSTAT_BUMP(arcstat_hash_elements);
 711         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 712
 713         return (NULL);
 714 }
 715
 716 static void
 717 buf_hash_remove(arc_buf_hdr_t *buf)
 718 {
 719         arc_buf_hdr_t *fbuf, **bufp;
 720         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 721
 722         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 723         ASSERT(HDR_IN_HASH_TABLE(buf));
 724
 725         bufp = &buf_hash_table.ht_table[idx];
 726         while ((fbuf = *bufp) != buf) {
 727                 ASSERT(fbuf != NULL);
 728                 bufp = &fbuf->b_hash_next;
 729         }
 730         *bufp = buf->b_hash_next;
 731         buf->b_hash_next = NULL;
 732         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 733
 734         /* collect some hash table performance data */
 735         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 736
 737         if (buf_hash_table.ht_table[idx] &&
 738             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 739                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 740 }
 741
 742 /*
 743  * Global data structures and functions for the buf kmem cache.
 744  */
 745 static kmem_cache_t *hdr_cache;
 746 static kmem_cache_t *buf_cache;
 747
 748 static void
 749 buf_fini(void)
 750 {
 751         int i;
 752
 753         kmem_free(buf_hash_table.ht_table,
 754             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 755         for (i = 0; i < BUF_LOCKS; i++)
 756                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 757         kmem_cache_destroy(hdr_cache);
 758         kmem_cache_destroy(buf_cache);
 759 }
 760
 761 /*
 762  * Constructor callback - called when the cache is empty
 763  * and a new buf is requested.
 764  */
 765 /* ARGSUSED */
 766 static int
 767 hdr_cons(void *vbuf, void *unused, int kmflag)
 768 {
 769         arc_buf_hdr_t *buf = vbuf;
 770
 771         bzero(buf, sizeof (arc_buf_hdr_t));
 772         refcount_create(&buf->b_refcnt);
 773         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 774         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 775
 776         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 777         return (0);
 778 }
 779
 780 /* ARGSUSED */
 781 static int
 782 buf_cons(void *vbuf, void *unused, int kmflag)
 783 {
 784         arc_buf_t *buf = vbuf;
 785
 786         bzero(buf, sizeof (arc_buf_t));
 787         rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
 788         return (0);
 789 }
 790
 791 /*
 792  * Destructor callback - called when a cached buf is
 793  * no longer required.
 794  */
 795 /* ARGSUSED */
 796 static void
 797 hdr_dest(void *vbuf, void *unused)
 798 {
 799         arc_buf_hdr_t *buf = vbuf;
 800
 801         refcount_destroy(&buf->b_refcnt);
 802         cv_destroy(&buf->b_cv);
 803         mutex_destroy(&buf->b_freeze_lock);
 804
 805         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 806 }
 807
 808 /* ARGSUSED */
 809 static void
 810 buf_dest(void *vbuf, void *unused)
 811 {
 812         arc_buf_t *buf = vbuf;
 813
 814         rw_destroy(&buf->b_lock);
 815 }
 816
 817 /*
 818  * Reclaim callback -- invoked when memory is low.
 819  */
 820 /* ARGSUSED */
 821 static void
 822 hdr_recl(void *unused)
 823 {
 824         dprintf("hdr_recl called\n");
 825         /*
 826          * umem calls the reclaim func when we destroy the buf cache,
 827          * which is after we do arc_fini().
 828          */
 829         if (!arc_dead)
 830                 cv_signal(&arc_reclaim_thr_cv);
 831 }
 832
 833 static void
 834 buf_init(void)
 835 {
 836         uint64_t *ct;
 837         uint64_t hsize = 1ULL << 12;
 838         int i, j;
 839
 840         /*
 841          * The hash table is big enough to fill all of physical memory
 842          * with an average 64K block size.  The table will take up
 843          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
 844          */
 845         while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
 846                 hsize <<= 1;
 847 retry:
 848         buf_hash_table.ht_mask = hsize - 1;
 849         buf_hash_table.ht_table =
 850             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 851         if (buf_hash_table.ht_table == NULL) {
 852                 ASSERT(hsize > (1ULL << 8));
 853                 hsize >>= 1;
 854                 goto retry;
 855         }
 856
 857         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 858             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 859         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 860             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 861
 862         for (i = 0; i < 256; i++)
 863                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 864                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 865
 866         for (i = 0; i < BUF_LOCKS; i++) {
 867                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 868                     NULL, MUTEX_DEFAULT, NULL);
 869         }
 870 }
 871
 872 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 873
 874 static void
 875 arc_cksum_verify(arc_buf_t *buf)
 876 {
 877         zio_cksum_t zc;
 878
 879         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 880                 return;
 881
 882         mutex_enter(&buf->b_hdr->b_freeze_lock);
 883         if (buf->b_hdr->b_freeze_cksum == NULL ||
 884             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 885                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 886                 return;
 887         }
 888         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 889         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 890                 panic("buffer modified while frozen!");
 891         mutex_exit(&buf->b_hdr->b_freeze_lock);
 892 }
 893
 894 static int
 895 arc_cksum_equal(arc_buf_t *buf)
 896 {
 897         zio_cksum_t zc;
 898         int equal;
 899
 900         mutex_enter(&buf->b_hdr->b_freeze_lock);
 901         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 902         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 903         mutex_exit(&buf->b_hdr->b_freeze_lock);
 904
 905         return (equal);
 906 }
 907
 908 static void
 909 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 910 {
 911         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 912                 return;
 913
 914         mutex_enter(&buf->b_hdr->b_freeze_lock);
 915         if (buf->b_hdr->b_freeze_cksum != NULL) {
 916                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 917                 return;
 918         }
 919         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 920         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 921             buf->b_hdr->b_freeze_cksum);
 922         mutex_exit(&buf->b_hdr->b_freeze_lock);
 923 }
 924
 925 void
 926 arc_buf_thaw(arc_buf_t *buf)
 927 {
 928         if (zfs_flags & ZFS_DEBUG_MODIFY) {
 929                 if (buf->b_hdr->b_state != arc_anon)
 930                         panic("modifying non-anon buffer!");
 931                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
 932                         panic("modifying buffer while i/o in progress!");
 933                 arc_cksum_verify(buf);
 934         }
 935
 936         mutex_enter(&buf->b_hdr->b_freeze_lock);
 937         if (buf->b_hdr->b_freeze_cksum != NULL) {
 938                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 939                 buf->b_hdr->b_freeze_cksum = NULL;
 940         }
 941         mutex_exit(&buf->b_hdr->b_freeze_lock);
 942 }
 943
 944 void
 945 arc_buf_freeze(arc_buf_t *buf)
 946 {
 947         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 948                 return;
 949
 950         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
 951             buf->b_hdr->b_state == arc_anon);
 952         arc_cksum_compute(buf, B_FALSE);
 953 }
 954
 955 static void
 956 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 957 {
 958         ASSERT(MUTEX_HELD(hash_lock));
 959
 960         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
 961             (ab->b_state != arc_anon)) {
 962                 uint64_t delta = ab->b_size * ab->b_datacnt;
 963                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
 964                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
 965
 966                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
 967                 mutex_enter(&ab->b_state->arcs_mtx);
 968                 ASSERT(list_link_active(&ab->b_arc_node));
 969                 list_remove(list, ab);
 970                 if (GHOST_STATE(ab->b_state)) {
 971                         ASSERT3U(ab->b_datacnt, ==, 0);
 972                         ASSERT3P(ab->b_buf, ==, NULL);
 973                         delta = ab->b_size;
 974                 }
 975                 ASSERT(delta > 0);
 976                 ASSERT3U(*size, >=, delta);
 977                 atomic_add_64(size, -delta);
 978                 mutex_exit(&ab->b_state->arcs_mtx);
 979                 /* remove the prefetch flag if we get a reference */
 980                 if (ab->b_flags & ARC_PREFETCH)
 981                         ab->b_flags &= ~ARC_PREFETCH;
 982         }
 983 }
 984
 985 static int
 986 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 987 {
 988         int cnt;
 989         arc_state_t *state = ab->b_state;
 990
 991         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 992         ASSERT(!GHOST_STATE(state));
 993
 994         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
 995             (state != arc_anon)) {
 996                 uint64_t *size = &state->arcs_lsize[ab->b_type];
 997
 998                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
 999                 mutex_enter(&state->arcs_mtx);
1000                 ASSERT(!list_link_active(&ab->b_arc_node));
1001                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1002                 ASSERT(ab->b_datacnt > 0);
1003                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1004                 mutex_exit(&state->arcs_mtx);
1005         }
1006         return (cnt);
1007 }
1008
1009 /*
1010  * Move the supplied buffer to the indicated state.  The mutex
1011  * for the buffer must be held by the caller.
1012  */
1013 static void
1014 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1015 {
1016         arc_state_t *old_state = ab->b_state;
1017         int64_t refcnt = refcount_count(&ab->b_refcnt);
1018         uint64_t from_delta, to_delta;
1019
1020         ASSERT(MUTEX_HELD(hash_lock));
1021         ASSERT(new_state != old_state);
1022         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1023         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1024
1025         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1026
1027         /*
1028          * If this buffer is evictable, transfer it from the
1029          * old state list to the new state list.
1030          */
1031         if (refcnt == 0) {
1032                 if (old_state != arc_anon) {
1033                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1034                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1035
1036                         if (use_mutex)
1037                                 mutex_enter(&old_state->arcs_mtx);
1038
1039                         ASSERT(list_link_active(&ab->b_arc_node));
1040                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1041
1042                         /*
1043                          * If prefetching out of the ghost cache,
1044                          * we will have a non-null datacnt.
1045                          */
1046                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1047                                 /* ghost elements have a ghost size */
1048                                 ASSERT(ab->b_buf == NULL);
1049                                 from_delta = ab->b_size;
1050                         }
1051                         ASSERT3U(*size, >=, from_delta);
1052                         atomic_add_64(size, -from_delta);
1053
1054                         if (use_mutex)
1055                                 mutex_exit(&old_state->arcs_mtx);
1056                 }
1057                 if (new_state != arc_anon) {
1058                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1059                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1060
1061                         if (use_mutex)
1062                                 mutex_enter(&new_state->arcs_mtx);
1063
1064                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1065
1066                         /* ghost elements have a ghost size */
1067                         if (GHOST_STATE(new_state)) {
1068                                 ASSERT(ab->b_datacnt == 0);
1069                                 ASSERT(ab->b_buf == NULL);
1070                                 to_delta = ab->b_size;
1071                         }
1072                         atomic_add_64(size, to_delta);
1073
1074                         if (use_mutex)
1075                                 mutex_exit(&new_state->arcs_mtx);
1076                 }
1077         }
1078
1079         ASSERT(!BUF_EMPTY(ab));
1080         if (new_state == arc_anon) {
1081                 buf_hash_remove(ab);
1082         }
1083
1084         /* adjust state sizes */
1085         if (to_delta)
1086                 atomic_add_64(&new_state->arcs_size, to_delta);
1087         if (from_delta) {
1088                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1089                 atomic_add_64(&old_state->arcs_size, -from_delta);
1090         }
1091         ab->b_state = new_state;
1092
1093         /* adjust l2arc hdr stats */
1094         if (new_state == arc_l2c_only)
1095                 l2arc_hdr_stat_add();
1096         else if (old_state == arc_l2c_only)
1097                 l2arc_hdr_stat_remove();
1098 }
1099
1100 void
1101 arc_space_consume(uint64_t space)
1102 {
1103         atomic_add_64(&arc_meta_used, space);
1104         atomic_add_64(&arc_size, space);
1105 }
1106
1107 void
1108 arc_space_return(uint64_t space)
1109 {
1110         ASSERT(arc_meta_used >= space);
1111         if (arc_meta_max < arc_meta_used)
1112                 arc_meta_max = arc_meta_used;
1113         atomic_add_64(&arc_meta_used, -space);
1114         ASSERT(arc_size >= space);
1115         atomic_add_64(&arc_size, -space);
1116 }
1117
1118 void *
1119 arc_data_buf_alloc(uint64_t size)
1120 {
1121         if (arc_evict_needed(ARC_BUFC_DATA))
1122                 cv_signal(&arc_reclaim_thr_cv);
1123         atomic_add_64(&arc_size, size);
1124         return (zio_data_buf_alloc(size));
1125 }
1126
1127 void
1128 arc_data_buf_free(void *buf, uint64_t size)
1129 {
1130         zio_data_buf_free(buf, size);
1131         ASSERT(arc_size >= size);
1132         atomic_add_64(&arc_size, -size);
1133 }
1134
1135 arc_buf_t *
1136 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1137 {
1138         arc_buf_hdr_t *hdr;
1139         arc_buf_t *buf;
1140
1141         ASSERT3U(size, >, 0);
1142         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1143         ASSERT(BUF_EMPTY(hdr));
1144         hdr->b_size = size;
1145         hdr->b_type = type;
1146         hdr->b_spa = spa;
1147         hdr->b_state = arc_anon;
1148         hdr->b_arc_access = 0;
1149         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1150         buf->b_hdr = hdr;
1151         buf->b_data = NULL;
1152         buf->b_efunc = NULL;
1153         buf->b_private = NULL;
1154         buf->b_next = NULL;
1155         hdr->b_buf = buf;
1156         arc_get_data_buf(buf);
1157         hdr->b_datacnt = 1;
1158         hdr->b_flags = 0;
1159         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1160         (void) refcount_add(&hdr->b_refcnt, tag);
1161
1162         return (buf);
1163 }
1164
1165 static arc_buf_t *
1166 arc_buf_clone(arc_buf_t *from)
1167 {
1168         arc_buf_t *buf;
1169         arc_buf_hdr_t *hdr = from->b_hdr;
1170         uint64_t size = hdr->b_size;
1171
1172         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1173         buf->b_hdr = hdr;
1174         buf->b_data = NULL;
1175         buf->b_efunc = NULL;
1176         buf->b_private = NULL;
1177         buf->b_next = hdr->b_buf;
1178         hdr->b_buf = buf;
1179         arc_get_data_buf(buf);
1180         bcopy(from->b_data, buf->b_data, size);
1181         hdr->b_datacnt += 1;
1182         return (buf);
1183 }
1184
1185 void
1186 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1187 {
1188         arc_buf_hdr_t *hdr;
1189         kmutex_t *hash_lock;
1190
1191         /*
1192          * Check to see if this buffer is evicted.  Callers
1193          * must verify b_data != NULL to know if the add_ref
1194          * was successful.
1195          */
1196         rw_enter(&buf->b_lock, RW_READER);
1197         if (buf->b_data == NULL) {
1198                 rw_exit(&buf->b_lock);
1199                 return;
1200         }
1201         hdr = buf->b_hdr;
1202         ASSERT(hdr != NULL);
1203         hash_lock = HDR_LOCK(hdr);
1204         mutex_enter(hash_lock);
1205         rw_exit(&buf->b_lock);
1206
1207         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1208         add_reference(hdr, hash_lock, tag);
1209         arc_access(hdr, hash_lock);
1210         mutex_exit(hash_lock);
1211         ARCSTAT_BUMP(arcstat_hits);
1212         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1213             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1214             data, metadata, hits);
1215 }
1216
1217 /*
1218  * Free the arc data buffer.  If it is an l2arc write in progress,
1219  * the buffer is placed on l2arc_free_on_write to be freed later.
1220  */
1221 static void
1222 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1223     void *data, size_t size)
1224 {
1225         if (HDR_L2_WRITING(hdr)) {
1226                 l2arc_data_free_t *df;
1227                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1228                 df->l2df_data = data;
1229                 df->l2df_size = size;
1230                 df->l2df_func = free_func;
1231                 mutex_enter(&l2arc_free_on_write_mtx);
1232                 list_insert_head(l2arc_free_on_write, df);
1233                 mutex_exit(&l2arc_free_on_write_mtx);
1234                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1235         } else {
1236                 free_func(data, size);
1237         }
1238 }
1239
1240 static void
1241 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1242 {
1243         arc_buf_t **bufp;
1244
1245         /* free up data associated with the buf */
1246         if (buf->b_data) {
1247                 arc_state_t *state = buf->b_hdr->b_state;
1248                 uint64_t size = buf->b_hdr->b_size;
1249                 arc_buf_contents_t type = buf->b_hdr->b_type;
1250
1251                 arc_cksum_verify(buf);
1252                 if (!recycle) {
1253                         if (type == ARC_BUFC_METADATA) {
1254                                 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1255                                     buf->b_data, size);
1256                                 arc_space_return(size);
1257                         } else {
1258                                 ASSERT(type == ARC_BUFC_DATA);
1259                                 arc_buf_data_free(buf->b_hdr,
1260                                     zio_data_buf_free, buf->b_data, size);
1261                                 atomic_add_64(&arc_size, -size);
1262                         }
1263                 }
1264                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1265                         uint64_t *cnt = &state->arcs_lsize[type];
1266
1267                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1268                         ASSERT(state != arc_anon);
1269
1270                         ASSERT3U(*cnt, >=, size);
1271                         atomic_add_64(cnt, -size);
1272                 }
1273                 ASSERT3U(state->arcs_size, >=, size);
1274                 atomic_add_64(&state->arcs_size, -size);
1275                 buf->b_data = NULL;
1276                 ASSERT(buf->b_hdr->b_datacnt > 0);
1277                 buf->b_hdr->b_datacnt -= 1;
1278         }
1279
1280         /* only remove the buf if requested */
1281         if (!all)
1282                 return;
1283
1284         /* remove the buf from the hdr list */
1285         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1286                 continue;
1287         *bufp = buf->b_next;
1288
1289         ASSERT(buf->b_efunc == NULL);
1290
1291         /* clean up the buf */
1292         buf->b_hdr = NULL;
1293         kmem_cache_free(buf_cache, buf);
1294 }
1295
1296 static void
1297 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1298 {
1299         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1300         ASSERT3P(hdr->b_state, ==, arc_anon);
1301         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1302         ASSERT(!(hdr->b_flags & ARC_STORED));
1303
1304         if (hdr->b_l2hdr != NULL) {
1305                 if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
1306                         /*
1307                          * To prevent arc_free() and l2arc_evict() from
1308                          * attempting to free the same buffer at the same time,
1309                          * a FREE_IN_PROGRESS flag is given to arc_free() to
1310                          * give it priority.  l2arc_evict() can't destroy this
1311                          * header while we are waiting on l2arc_buflist_mtx.
1312                          *
1313                          * The hdr may be removed from l2ad_buflist before we
1314                          * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1315                          */
1316                         mutex_enter(&l2arc_buflist_mtx);
1317                         if (hdr->b_l2hdr != NULL) {
1318                                 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
1319                                     hdr);
1320                         }
1321                         mutex_exit(&l2arc_buflist_mtx);
1322                 } else {
1323                         list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
1324                 }
1325                 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1326                 kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
1327                 if (hdr->b_state == arc_l2c_only)
1328                         l2arc_hdr_stat_remove();
1329                 hdr->b_l2hdr = NULL;
1330         }
1331
1332         if (!BUF_EMPTY(hdr)) {
1333                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1334                 bzero(&hdr->b_dva, sizeof (dva_t));
1335                 hdr->b_birth = 0;
1336                 hdr->b_cksum0 = 0;
1337         }
1338         while (hdr->b_buf) {
1339                 arc_buf_t *buf = hdr->b_buf;
1340
1341                 if (buf->b_efunc) {
1342                         mutex_enter(&arc_eviction_mtx);
1343                         rw_enter(&buf->b_lock, RW_WRITER);
1344                         ASSERT(buf->b_hdr != NULL);
1345                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1346                         hdr->b_buf = buf->b_next;
1347                         buf->b_hdr = &arc_eviction_hdr;
1348                         buf->b_next = arc_eviction_list;
1349                         arc_eviction_list = buf;
1350                         rw_exit(&buf->b_lock);
1351                         mutex_exit(&arc_eviction_mtx);
1352                 } else {
1353                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1354                 }
1355         }
1356         if (hdr->b_freeze_cksum != NULL) {
1357                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1358                 hdr->b_freeze_cksum = NULL;
1359         }
1360
1361         ASSERT(!list_link_active(&hdr->b_arc_node));
1362         ASSERT3P(hdr->b_hash_next, ==, NULL);
1363         ASSERT3P(hdr->b_acb, ==, NULL);
1364         kmem_cache_free(hdr_cache, hdr);
1365 }
1366
1367 void
1368 arc_buf_free(arc_buf_t *buf, void *tag)
1369 {
1370         arc_buf_hdr_t *hdr = buf->b_hdr;
1371         int hashed = hdr->b_state != arc_anon;
1372
1373         ASSERT(buf->b_efunc == NULL);
1374         ASSERT(buf->b_data != NULL);
1375
1376         if (hashed) {
1377                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1378
1379                 mutex_enter(hash_lock);
1380                 (void) remove_reference(hdr, hash_lock, tag);
1381                 if (hdr->b_datacnt > 1)
1382                         arc_buf_destroy(buf, FALSE, TRUE);
1383                 else
1384                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1385                 mutex_exit(hash_lock);
1386         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1387                 int destroy_hdr;
1388                 /*
1389                  * We are in the middle of an async write.  Don't destroy
1390                  * this buffer unless the write completes before we finish
1391                  * decrementing the reference count.
1392                  */
1393                 mutex_enter(&arc_eviction_mtx);
1394                 (void) remove_reference(hdr, NULL, tag);
1395                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1396                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1397                 mutex_exit(&arc_eviction_mtx);
1398                 if (destroy_hdr)
1399                         arc_hdr_destroy(hdr);
1400         } else {
1401                 if (remove_reference(hdr, NULL, tag) > 0) {
1402                         ASSERT(HDR_IO_ERROR(hdr));
1403                         arc_buf_destroy(buf, FALSE, TRUE);
1404                 } else {
1405                         arc_hdr_destroy(hdr);
1406                 }
1407         }
1408 }
1409
1410 int
1411 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1412 {
1413         arc_buf_hdr_t *hdr = buf->b_hdr;
1414         kmutex_t *hash_lock = HDR_LOCK(hdr);
1415         int no_callback = (buf->b_efunc == NULL);
1416
1417         if (hdr->b_state == arc_anon) {
1418                 arc_buf_free(buf, tag);
1419                 return (no_callback);
1420         }
1421
1422         mutex_enter(hash_lock);
1423         ASSERT(hdr->b_state != arc_anon);
1424         ASSERT(buf->b_data != NULL);
1425
1426         (void) remove_reference(hdr, hash_lock, tag);
1427         if (hdr->b_datacnt > 1) {
1428                 if (no_callback)
1429                         arc_buf_destroy(buf, FALSE, TRUE);
1430         } else if (no_callback) {
1431                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1432                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1433         }
1434         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1435             refcount_is_zero(&hdr->b_refcnt));
1436         mutex_exit(hash_lock);
1437         return (no_callback);
1438 }
1439
1440 int
1441 arc_buf_size(arc_buf_t *buf)
1442 {
1443         return (buf->b_hdr->b_size);
1444 }
1445
1446 /*
1447  * Evict buffers from list until we've removed the specified number of
1448  * bytes.  Move the removed buffers to the appropriate evict state.
1449  * If the recycle flag is set, then attempt to "recycle" a buffer:
1450  * - look for a buffer to evict that is `bytes' long.
1451  * - return the data block from this buffer rather than freeing it.
1452  * This flag is used by callers that are trying to make space for a
1453  * new buffer in a full arc cache.
1454  *
1455  * This function makes a "best effort".  It skips over any buffers
1456  * it can't get a hash_lock on, and so may not catch all candidates.
1457  * It may also return without evicting as much space as requested.
1458  */
1459 static void *
1460 arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
1461     arc_buf_contents_t type)
1462 {
1463         arc_state_t *evicted_state;
1464         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1465         arc_buf_hdr_t *ab, *ab_prev = NULL;
1466         list_t *list = &state->arcs_list[type];
1467         kmutex_t *hash_lock;
1468         boolean_t have_lock;
1469         void *stolen = NULL;
1470
1471         ASSERT(state == arc_mru || state == arc_mfu);
1472
1473         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1474
1475         mutex_enter(&state->arcs_mtx);
1476         mutex_enter(&evicted_state->arcs_mtx);
1477
1478         for (ab = list_tail(list); ab; ab = ab_prev) {
1479                 ab_prev = list_prev(list, ab);
1480                 /* prefetch buffers have a minimum lifespan */
1481                 if (HDR_IO_IN_PROGRESS(ab) ||
1482                     (spa && ab->b_spa != spa) ||
1483                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1484                     LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1485                         skipped++;
1486                         continue;
1487                 }
1488                 /* "lookahead" for better eviction candidate */
1489                 if (recycle && ab->b_size != bytes &&
1490                     ab_prev && ab_prev->b_size == bytes)
1491                         continue;
1492                 hash_lock = HDR_LOCK(ab);
1493                 have_lock = MUTEX_HELD(hash_lock);
1494                 if (have_lock || mutex_tryenter(hash_lock)) {
1495                         ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1496                         ASSERT(ab->b_datacnt > 0);
1497                         while (ab->b_buf) {
1498                                 arc_buf_t *buf = ab->b_buf;
1499                                 if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1500                                         missed += 1;
1501                                         break;
1502                                 }
1503                                 if (buf->b_data) {
1504                                         bytes_evicted += ab->b_size;
1505                                         if (recycle && ab->b_type == type &&
1506                                             ab->b_size == bytes &&
1507                                             !HDR_L2_WRITING(ab)) {
1508                                                 stolen = buf->b_data;
1509                                                 recycle = FALSE;
1510                                         }
1511                                 }
1512                                 if (buf->b_efunc) {
1513                                         mutex_enter(&arc_eviction_mtx);
1514                                         arc_buf_destroy(buf,
1515                                             buf->b_data == stolen, FALSE);
1516                                         ab->b_buf = buf->b_next;
1517                                         buf->b_hdr = &arc_eviction_hdr;
1518                                         buf->b_next = arc_eviction_list;
1519                                         arc_eviction_list = buf;
1520                                         mutex_exit(&arc_eviction_mtx);
1521                                         rw_exit(&buf->b_lock);
1522                                 } else {
1523                                         rw_exit(&buf->b_lock);
1524                                         arc_buf_destroy(buf,
1525                                             buf->b_data == stolen, TRUE);
1526                                 }
1527                         }
1528                         if (ab->b_datacnt == 0) {
1529                                 arc_change_state(evicted_state, ab, hash_lock);
1530                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1531                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1532                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1533                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1534                         }
1535                         if (!have_lock)
1536                                 mutex_exit(hash_lock);
1537                         if (bytes >= 0 && bytes_evicted >= bytes)
1538                                 break;
1539                 } else {
1540                         missed += 1;
1541                 }
1542         }
1543
1544         mutex_exit(&evicted_state->arcs_mtx);
1545         mutex_exit(&state->arcs_mtx);
1546
1547         if (bytes_evicted < bytes)
1548                 dprintf("only evicted %lld bytes from %x",
1549                     (longlong_t)bytes_evicted, state);
1550
1551         if (skipped)
1552                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1553
1554         if (missed)
1555                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1556
1557         /*
1558          * We have just evicted some date into the ghost state, make
1559          * sure we also adjust the ghost state size if necessary.
1560          */
1561         if (arc_no_grow &&
1562             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1563                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1564                     arc_mru_ghost->arcs_size - arc_c;
1565
1566                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1567                         int64_t todelete =
1568                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1569                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1570                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1571                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1572                             arc_mru_ghost->arcs_size +
1573                             arc_mfu_ghost->arcs_size - arc_c);
1574                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1575                 }
1576         }
1577
1578         return (stolen);
1579 }
1580
1581 /*
1582  * Remove buffers from list until we've removed the specified number of
1583  * bytes.  Destroy the buffers that are removed.
1584  */
1585 static void
1586 arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
1587 {
1588         arc_buf_hdr_t *ab, *ab_prev;
1589         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1590         kmutex_t *hash_lock;
1591         uint64_t bytes_deleted = 0;
1592         uint64_t bufs_skipped = 0;
1593
1594         ASSERT(GHOST_STATE(state));
1595 top:
1596         mutex_enter(&state->arcs_mtx);
1597         for (ab = list_tail(list); ab; ab = ab_prev) {
1598                 ab_prev = list_prev(list, ab);
1599                 if (spa && ab->b_spa != spa)
1600                         continue;
1601                 hash_lock = HDR_LOCK(ab);
1602                 if (mutex_tryenter(hash_lock)) {
1603                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1604                         ASSERT(ab->b_buf == NULL);
1605                         ARCSTAT_BUMP(arcstat_deleted);
1606                         bytes_deleted += ab->b_size;
1607
1608                         if (ab->b_l2hdr != NULL) {
1609                                 /*
1610                                  * This buffer is cached on the 2nd Level ARC;
1611                                  * don't destroy the header.
1612                                  */
1613                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1614                                 mutex_exit(hash_lock);
1615                         } else {
1616                                 arc_change_state(arc_anon, ab, hash_lock);
1617                                 mutex_exit(hash_lock);
1618                                 arc_hdr_destroy(ab);
1619                         }
1620
1621                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1622                         if (bytes >= 0 && bytes_deleted >= bytes)
1623                                 break;
1624                 } else {
1625                         if (bytes < 0) {
1626                                 mutex_exit(&state->arcs_mtx);
1627                                 mutex_enter(hash_lock);
1628                                 mutex_exit(hash_lock);
1629                                 goto top;
1630                         }
1631                         bufs_skipped += 1;
1632                 }
1633         }
1634         mutex_exit(&state->arcs_mtx);
1635
1636         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1637             (bytes < 0 || bytes_deleted < bytes)) {
1638                 list = &state->arcs_list[ARC_BUFC_METADATA];
1639                 goto top;
1640         }
1641
1642         if (bufs_skipped) {
1643                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1644                 ASSERT(bytes >= 0);
1645         }
1646
1647         if (bytes_deleted < bytes)
1648                 dprintf("only deleted %lld bytes from %p",
1649                     (longlong_t)bytes_deleted, state);
1650 }
1651
1652 static void
1653 arc_adjust(void)
1654 {
1655         int64_t top_sz, mru_over, arc_over, todelete;
1656
1657         top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
1658
1659         if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1660                 int64_t toevict =
1661                     MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
1662                 (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
1663                 top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1664         }
1665
1666         if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1667                 int64_t toevict =
1668                     MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
1669                 (void) arc_evict(arc_mru, NULL, toevict, FALSE,
1670                     ARC_BUFC_METADATA);
1671                 top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1672         }
1673
1674         mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1675
1676         if (mru_over > 0) {
1677                 if (arc_mru_ghost->arcs_size > 0) {
1678                         todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
1679                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1680                 }
1681         }
1682
1683         if ((arc_over = arc_size - arc_c) > 0) {
1684                 int64_t tbl_over;
1685
1686                 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1687                         int64_t toevict =
1688                             MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
1689                         (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1690                             ARC_BUFC_DATA);
1691                         arc_over = arc_size - arc_c;
1692                 }
1693
1694                 if (arc_over > 0 &&
1695                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1696                         int64_t toevict =
1697                             MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
1698                             arc_over);
1699                         (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1700                             ARC_BUFC_METADATA);
1701                 }
1702
1703                 tbl_over = arc_size + arc_mru_ghost->arcs_size +
1704                     arc_mfu_ghost->arcs_size - arc_c * 2;
1705
1706                 if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
1707                         todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
1708                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1709                 }
1710         }
1711 }
1712
1713 static void
1714 arc_do_user_evicts(void)
1715 {
1716         static arc_buf_t *tmp_arc_eviction_list;
1717
1718         /*
1719          * Move list over to avoid LOR
1720          */
1721 restart:
1722         mutex_enter(&arc_eviction_mtx);
1723         tmp_arc_eviction_list = arc_eviction_list;
1724         arc_eviction_list = NULL;
1725         mutex_exit(&arc_eviction_mtx);
1726
1727         while (tmp_arc_eviction_list != NULL) {
1728                 arc_buf_t *buf = tmp_arc_eviction_list;
1729                 tmp_arc_eviction_list = buf->b_next;
1730                 rw_enter(&buf->b_lock, RW_WRITER);
1731                 buf->b_hdr = NULL;
1732                 rw_exit(&buf->b_lock);
1733
1734                 if (buf->b_efunc != NULL)
1735                         VERIFY(buf->b_efunc(buf) == 0);
1736
1737                 buf->b_efunc = NULL;
1738                 buf->b_private = NULL;
1739                 kmem_cache_free(buf_cache, buf);
1740         }
1741
1742         if (arc_eviction_list != NULL)
1743                 goto restart;
1744 }
1745
1746 /*
1747  * Flush all *evictable* data from the cache for the given spa.
1748  * NOTE: this will not touch "active" (i.e. referenced) data.
1749  */
1750 void
1751 arc_flush(spa_t *spa)
1752 {
1753         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
1754                 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
1755                 if (spa)
1756                         break;
1757         }
1758         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
1759                 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
1760                 if (spa)
1761                         break;
1762         }
1763         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
1764                 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
1765                 if (spa)
1766                         break;
1767         }
1768         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
1769                 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
1770                 if (spa)
1771                         break;
1772         }
1773
1774         arc_evict_ghost(arc_mru_ghost, spa, -1);
1775         arc_evict_ghost(arc_mfu_ghost, spa, -1);
1776
1777         mutex_enter(&arc_reclaim_thr_lock);
1778         arc_do_user_evicts();
1779         mutex_exit(&arc_reclaim_thr_lock);
1780         ASSERT(spa || arc_eviction_list == NULL);
1781 }
1782
1783 int arc_shrink_shift = 5;               /* log2(fraction of arc to reclaim) */
1784
1785 void
1786 arc_shrink(void)
1787 {
1788         if (arc_c > arc_c_min) {
1789                 uint64_t to_free;
1790
1791 #ifdef _KERNEL
1792                 to_free = arc_c >> arc_shrink_shift;
1793 #else
1794                 to_free = arc_c >> arc_shrink_shift;
1795 #endif
1796                 if (arc_c > arc_c_min + to_free)
1797                         atomic_add_64(&arc_c, -to_free);
1798                 else
1799                         arc_c = arc_c_min;
1800
1801                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1802                 if (arc_c > arc_size)
1803                         arc_c = MAX(arc_size, arc_c_min);
1804                 if (arc_p > arc_c)
1805                         arc_p = (arc_c >> 1);
1806                 ASSERT(arc_c >= arc_c_min);
1807                 ASSERT((int64_t)arc_p >= 0);
1808         }
1809
1810         if (arc_size > arc_c)
1811                 arc_adjust();
1812 }
1813
1814 static int needfree = 0;
1815
1816 static int
1817 arc_reclaim_needed(void)
1818 {
1819 #if 0
1820         uint64_t extra;
1821 #endif
1822
1823 #ifdef _KERNEL
1824
1825         /*
1826          * If pages are needed or we're within 2048 pages
1827          * of needing to page need to reclaim
1828          */
1829         if (vm_pages_needed || (vm_paging_target() > -2048))
1830                 return (1);
1831
1832         if (needfree)
1833                 return (1);
1834
1835 #if 0
1836         /*
1837          * take 'desfree' extra pages, so we reclaim sooner, rather than later
1838          */
1839         extra = desfree;
1840
1841         /*
1842          * check that we're out of range of the pageout scanner.  It starts to
1843          * schedule paging if freemem is less than lotsfree and needfree.
1844          * lotsfree is the high-water mark for pageout, and needfree is the
1845          * number of needed free pages.  We add extra pages here to make sure
1846          * the scanner doesn't start up while we're freeing memory.
1847          */
1848         if (freemem < lotsfree + needfree + extra)
1849                 return (1);
1850
1851         /*
1852          * check to make sure that swapfs has enough space so that anon
1853          * reservations can still succeed. anon_resvmem() checks that the
1854          * availrmem is greater than swapfs_minfree, and the number of reserved
1855          * swap pages.  We also add a bit of extra here just to prevent
1856          * circumstances from getting really dire.
1857          */
1858         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1859                 return (1);
1860
1861 #if defined(__i386)
1862         /*
1863          * If we're on an i386 platform, it's possible that we'll exhaust the
1864          * kernel heap space before we ever run out of available physical
1865          * memory.  Most checks of the size of the heap_area compare against
1866          * tune.t_minarmem, which is the minimum available real memory that we
1867          * can have in the system.  However, this is generally fixed at 25 pages
1868          * which is so low that it's useless.  In this comparison, we seek to
1869          * calculate the total heap-size, and reclaim if more than 3/4ths of the
1870          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1871          * free)
1872          */
1873         if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1874             (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1875                 return (1);
1876 #endif
1877 #else
1878         if (kmem_used() > (kmem_size() * 3) / 4)
1879                 return (1);
1880 #endif
1881
1882 #else
1883         if (spa_get_random(100) == 0)
1884                 return (1);
1885 #endif
1886         return (0);
1887 }
1888
1889 static void
1890 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1891 {
1892 #ifdef ZIO_USE_UMA
1893         size_t                  i;
1894         kmem_cache_t            *prev_cache = NULL;
1895         kmem_cache_t            *prev_data_cache = NULL;
1896         extern kmem_cache_t     *zio_buf_cache[];
1897         extern kmem_cache_t     *zio_data_buf_cache[];
1898 #endif
1899
1900 #ifdef _KERNEL
1901         if (arc_meta_used >= arc_meta_limit) {
1902                 /*
1903                  * We are exceeding our meta-data cache limit.
1904                  * Purge some DNLC entries to release holds on meta-data.
1905                  */
1906                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1907         }
1908 #if defined(__i386)
1909         /*
1910          * Reclaim unused memory from all kmem caches.
1911          */
1912         kmem_reap();
1913 #endif
1914 #endif
1915
1916         /*
1917          * An aggressive reclamation will shrink the cache size as well as
1918          * reap free buffers from the arc kmem caches.
1919          */
1920         if (strat == ARC_RECLAIM_AGGR)
1921                 arc_shrink();
1922
1923 #ifdef ZIO_USE_UMA
1924         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1925                 if (zio_buf_cache[i] != prev_cache) {
1926                         prev_cache = zio_buf_cache[i];
1927                         kmem_cache_reap_now(zio_buf_cache[i]);
1928                 }
1929                 if (zio_data_buf_cache[i] != prev_data_cache) {
1930                         prev_data_cache = zio_data_buf_cache[i];
1931                         kmem_cache_reap_now(zio_data_buf_cache[i]);
1932                 }
1933         }
1934 #endif
1935         kmem_cache_reap_now(buf_cache);
1936         kmem_cache_reap_now(hdr_cache);
1937 }
1938
1939 static void
1940 arc_reclaim_thread(void *dummy __unused)
1941 {
1942         clock_t                 growtime = 0;
1943         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
1944         callb_cpr_t             cpr;
1945
1946         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1947
1948         mutex_enter(&arc_reclaim_thr_lock);
1949         while (arc_thread_exit == 0) {
1950                 if (arc_reclaim_needed()) {
1951
1952                         if (arc_no_grow) {
1953                                 if (last_reclaim == ARC_RECLAIM_CONS) {
1954                                         last_reclaim = ARC_RECLAIM_AGGR;
1955                                 } else {
1956                                         last_reclaim = ARC_RECLAIM_CONS;
1957                                 }
1958                         } else {
1959                                 arc_no_grow = TRUE;
1960                                 last_reclaim = ARC_RECLAIM_AGGR;
1961                                 membar_producer();
1962                         }
1963
1964                         /* reset the growth delay for every reclaim */
1965                         growtime = LBOLT + (arc_grow_retry * hz);
1966
1967                         if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
1968                                 /*
1969                                  * If needfree is TRUE our vm_lowmem hook
1970                                  * was called and in that case we must free some
1971                                  * memory, so switch to aggressive mode.
1972                                  */
1973                                 arc_no_grow = TRUE;
1974                                 last_reclaim = ARC_RECLAIM_AGGR;
1975                         }
1976                         arc_kmem_reap_now(last_reclaim);
1977                         arc_warm = B_TRUE;
1978
1979                 } else if (arc_no_grow && LBOLT >= growtime) {
1980                         arc_no_grow = FALSE;
1981                 }
1982
1983                 if (needfree ||
1984                     (2 * arc_c < arc_size +
1985                     arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
1986                         arc_adjust();
1987
1988                 if (arc_eviction_list != NULL)
1989                         arc_do_user_evicts();
1990
1991                 if (arc_reclaim_needed()) {
1992                         needfree = 0;
1993 #ifdef _KERNEL
1994                         wakeup(&needfree);
1995 #endif
1996                 }
1997
1998                 /* block until needed, or one second, whichever is shorter */
1999                 CALLB_CPR_SAFE_BEGIN(&cpr);
2000                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2001                     &arc_reclaim_thr_lock, hz);
2002                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2003         }
2004
2005         arc_thread_exit = 0;
2006         cv_broadcast(&arc_reclaim_thr_cv);
2007         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2008         thread_exit();
2009 }
2010
2011 /*
2012  * Adapt arc info given the number of bytes we are trying to add and
2013  * the state that we are comming from.  This function is only called
2014  * when we are adding new content to the cache.
2015  */
2016 static void
2017 arc_adapt(int bytes, arc_state_t *state)
2018 {
2019         int mult;
2020
2021         if (state == arc_l2c_only)
2022                 return;
2023
2024         ASSERT(bytes > 0);
2025         /*
2026          * Adapt the target size of the MRU list:
2027          *      - if we just hit in the MRU ghost list, then increase
2028          *        the target size of the MRU list.
2029          *      - if we just hit in the MFU ghost list, then increase
2030          *        the target size of the MFU list by decreasing the
2031          *        target size of the MRU list.
2032          */
2033         if (state == arc_mru_ghost) {
2034                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2035                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2036
2037                 arc_p = MIN(arc_c, arc_p + bytes * mult);
2038         } else if (state == arc_mfu_ghost) {
2039                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2040                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2041
2042                 arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
2043         }
2044         ASSERT((int64_t)arc_p >= 0);
2045
2046         if (arc_reclaim_needed()) {
2047                 cv_signal(&arc_reclaim_thr_cv);
2048                 return;
2049         }
2050
2051         if (arc_no_grow)
2052                 return;
2053
2054         if (arc_c >= arc_c_max)
2055                 return;
2056
2057         /*
2058          * If we're within (2 * maxblocksize) bytes of the target
2059          * cache size, increment the target cache size
2060          */
2061         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2062                 atomic_add_64(&arc_c, (int64_t)bytes);
2063                 if (arc_c > arc_c_max)
2064                         arc_c = arc_c_max;
2065                 else if (state == arc_anon)
2066                         atomic_add_64(&arc_p, (int64_t)bytes);
2067                 if (arc_p > arc_c)
2068                         arc_p = arc_c;
2069         }
2070         ASSERT((int64_t)arc_p >= 0);
2071 }
2072
2073 /*
2074  * Check if the cache has reached its limits and eviction is required
2075  * prior to insert.
2076  */
2077 static int
2078 arc_evict_needed(arc_buf_contents_t type)
2079 {
2080         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2081                 return (1);
2082
2083 #if 0
2084 #ifdef _KERNEL
2085         /*
2086          * If zio data pages are being allocated out of a separate heap segment,
2087          * then enforce that the size of available vmem for this area remains
2088          * above about 1/32nd free.
2089          */
2090         if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2091             vmem_size(zio_arena, VMEM_FREE) <
2092             (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2093                 return (1);
2094 #endif
2095 #endif
2096
2097         if (arc_reclaim_needed())
2098                 return (1);
2099
2100         return (arc_size > arc_c);
2101 }
2102
2103 /*
2104  * The buffer, supplied as the first argument, needs a data block.
2105  * So, if we are at cache max, determine which cache should be victimized.
2106  * We have the following cases:
2107  *
2108  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2109  * In this situation if we're out of space, but the resident size of the MFU is
2110  * under the limit, victimize the MFU cache to satisfy this insertion request.
2111  *
2112  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2113  * Here, we've used up all of the available space for the MRU, so we need to
2114  * evict from our own cache instead.  Evict from the set of resident MRU
2115  * entries.
2116  *
2117  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2118  * c minus p represents the MFU space in the cache, since p is the size of the
2119  * cache that is dedicated to the MRU.  In this situation there's still space on
2120  * the MFU side, so the MRU side needs to be victimized.
2121  *
2122  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2123  * MFU's resident set is consuming more space than it has been allotted.  In
2124  * this situation, we must victimize our own cache, the MFU, for this insertion.
2125  */
2126 static void
2127 arc_get_data_buf(arc_buf_t *buf)
2128 {
2129         arc_state_t             *state = buf->b_hdr->b_state;
2130         uint64_t                size = buf->b_hdr->b_size;
2131         arc_buf_contents_t      type = buf->b_hdr->b_type;
2132
2133         arc_adapt(size, state);
2134
2135         /*
2136          * We have not yet reached cache maximum size,
2137          * just allocate a new buffer.
2138          */
2139         if (!arc_evict_needed(type)) {
2140                 if (type == ARC_BUFC_METADATA) {
2141                         buf->b_data = zio_buf_alloc(size);
2142                         arc_space_consume(size);
2143                 } else {
2144                         ASSERT(type == ARC_BUFC_DATA);
2145                         buf->b_data = zio_data_buf_alloc(size);
2146                         atomic_add_64(&arc_size, size);
2147                 }
2148                 goto out;
2149         }
2150
2151         /*
2152          * If we are prefetching from the mfu ghost list, this buffer
2153          * will end up on the mru list; so steal space from there.
2154          */
2155         if (state == arc_mfu_ghost)
2156                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2157         else if (state == arc_mru_ghost)
2158                 state = arc_mru;
2159
2160         if (state == arc_mru || state == arc_anon) {
2161                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2162                 state = (arc_mfu->arcs_lsize[type] > 0 &&
2163                     arc_p > mru_used) ? arc_mfu : arc_mru;
2164         } else {
2165                 /* MFU cases */
2166                 uint64_t mfu_space = arc_c - arc_p;
2167                 state =  (arc_mru->arcs_lsize[type] > 0 &&
2168                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2169         }
2170         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2171                 if (type == ARC_BUFC_METADATA) {
2172                         buf->b_data = zio_buf_alloc(size);
2173                         arc_space_consume(size);
2174                 } else {
2175                         ASSERT(type == ARC_BUFC_DATA);
2176                         buf->b_data = zio_data_buf_alloc(size);
2177                         atomic_add_64(&arc_size, size);
2178                 }
2179                 ARCSTAT_BUMP(arcstat_recycle_miss);
2180         }
2181         ASSERT(buf->b_data != NULL);
2182 out:
2183         /*
2184          * Update the state size.  Note that ghost states have a
2185          * "ghost size" and so don't need to be updated.
2186          */
2187         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2188                 arc_buf_hdr_t *hdr = buf->b_hdr;
2189
2190                 atomic_add_64(&hdr->b_state->arcs_size, size);
2191                 if (list_link_active(&hdr->b_arc_node)) {
2192                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2193                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2194                 }
2195                 /*
2196                  * If we are growing the cache, and we are adding anonymous
2197                  * data, and we have outgrown arc_p, update arc_p
2198                  */
2199                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2200                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2201                         arc_p = MIN(arc_c, arc_p + size);
2202         }
2203 }
2204
2205 /*
2206  * This routine is called whenever a buffer is accessed.
2207  * NOTE: the hash lock is dropped in this function.
2208  */
2209 static void
2210 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2211 {
2212         ASSERT(MUTEX_HELD(hash_lock));
2213
2214         if (buf->b_state == arc_anon) {
2215                 /*
2216                  * This buffer is not in the cache, and does not
2217                  * appear in our "ghost" list.  Add the new buffer
2218                  * to the MRU state.
2219                  */
2220
2221                 ASSERT(buf->b_arc_access == 0);
2222                 buf->b_arc_access = LBOLT;
2223                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2224                 arc_change_state(arc_mru, buf, hash_lock);
2225
2226         } else if (buf->b_state == arc_mru) {
2227                 /*
2228                  * If this buffer is here because of a prefetch, then either:
2229                  * - clear the flag if this is a "referencing" read
2230                  *   (any subsequent access will bump this into the MFU state).
2231                  * or
2232                  * - move the buffer to the head of the list if this is
2233                  *   another prefetch (to make it less likely to be evicted).
2234                  */
2235                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2236                         if (refcount_count(&buf->b_refcnt) == 0) {
2237                                 ASSERT(list_link_active(&buf->b_arc_node));
2238                         } else {
2239                                 buf->b_flags &= ~ARC_PREFETCH;
2240                                 ARCSTAT_BUMP(arcstat_mru_hits);
2241                         }
2242                         buf->b_arc_access = LBOLT;
2243                         return;
2244                 }
2245
2246                 /*
2247                  * This buffer has been "accessed" only once so far,
2248                  * but it is still in the cache. Move it to the MFU
2249                  * state.
2250                  */
2251                 if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
2252                         /*
2253                          * More than 125ms have passed since we
2254                          * instantiated this buffer.  Move it to the
2255                          * most frequently used state.
2256                          */
2257                         buf->b_arc_access = LBOLT;
2258                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2259                         arc_change_state(arc_mfu, buf, hash_lock);
2260                 }
2261                 ARCSTAT_BUMP(arcstat_mru_hits);
2262         } else if (buf->b_state == arc_mru_ghost) {
2263                 arc_state_t     *new_state;
2264                 /*
2265                  * This buffer has been "accessed" recently, but
2266                  * was evicted from the cache.  Move it to the
2267                  * MFU state.
2268                  */
2269
2270                 if (buf->b_flags & ARC_PREFETCH) {
2271                         new_state = arc_mru;
2272                         if (refcount_count(&buf->b_refcnt) > 0)
2273                                 buf->b_flags &= ~ARC_PREFETCH;
2274                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2275                 } else {
2276                         new_state = arc_mfu;
2277                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2278                 }
2279
2280                 buf->b_arc_access = LBOLT;
2281                 arc_change_state(new_state, buf, hash_lock);
2282
2283                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2284         } else if (buf->b_state == arc_mfu) {
2285                 /*
2286                  * This buffer has been accessed more than once and is
2287                  * still in the cache.  Keep it in the MFU state.
2288                  *
2289                  * NOTE: an add_reference() that occurred when we did
2290                  * the arc_read() will have kicked this off the list.
2291                  * If it was a prefetch, we will explicitly move it to
2292                  * the head of the list now.
2293                  */
2294                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2295                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2296                         ASSERT(list_link_active(&buf->b_arc_node));
2297                 }
2298                 ARCSTAT_BUMP(arcstat_mfu_hits);
2299                 buf->b_arc_access = LBOLT;
2300         } else if (buf->b_state == arc_mfu_ghost) {
2301                 arc_state_t     *new_state = arc_mfu;
2302                 /*
2303                  * This buffer has been accessed more than once but has
2304                  * been evicted from the cache.  Move it back to the
2305                  * MFU state.
2306                  */
2307
2308                 if (buf->b_flags & ARC_PREFETCH) {
2309                         /*
2310                          * This is a prefetch access...
2311                          * move this block back to the MRU state.
2312                          */
2313                         ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2314                         new_state = arc_mru;
2315                 }
2316
2317                 buf->b_arc_access = LBOLT;
2318                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2319                 arc_change_state(new_state, buf, hash_lock);
2320
2321                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2322         } else if (buf->b_state == arc_l2c_only) {
2323                 /*
2324                  * This buffer is on the 2nd Level ARC.
2325                  */
2326
2327                 buf->b_arc_access = LBOLT;
2328                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2329                 arc_change_state(arc_mfu, buf, hash_lock);
2330         } else {
2331                 ASSERT(!"invalid arc state");
2332         }
2333 }
2334
2335 /* a generic arc_done_func_t which you can use */
2336 /* ARGSUSED */
2337 void
2338 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2339 {
2340         bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2341         VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2342 }
2343
2344 /* a generic arc_done_func_t */
2345 void
2346 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2347 {
2348         arc_buf_t **bufp = arg;
2349         if (zio && zio->io_error) {
2350                 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2351                 *bufp = NULL;
2352         } else {
2353                 *bufp = buf;
2354         }
2355 }
2356
2357 static void
2358 arc_read_done(zio_t *zio)
2359 {
2360         arc_buf_hdr_t   *hdr, *found;
2361         arc_buf_t       *buf;
2362         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2363         kmutex_t        *hash_lock;
2364         arc_callback_t  *callback_list, *acb;
2365         int             freeable = FALSE;
2366
2367         buf = zio->io_private;
2368         hdr = buf->b_hdr;
2369
2370         /*
2371          * The hdr was inserted into hash-table and removed from lists
2372          * prior to starting I/O.  We should find this header, since
2373          * it's in the hash table, and it should be legit since it's
2374          * not possible to evict it during the I/O.  The only possible
2375          * reason for it not to be found is if we were freed during the
2376          * read.
2377          */
2378         found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
2379             &hash_lock);
2380
2381         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2382             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2383             (found == hdr && HDR_L2_READING(hdr)));
2384
2385         hdr->b_flags &= ~ARC_L2_EVICTED;
2386         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2387                 hdr->b_flags &= ~ARC_L2CACHE;
2388
2389         /* byteswap if necessary */
2390         callback_list = hdr->b_acb;
2391         ASSERT(callback_list != NULL);
2392         if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
2393                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2394                     byteswap_uint64_array :
2395                     dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2396                 func(buf->b_data, hdr->b_size);
2397         }
2398
2399         arc_cksum_compute(buf, B_FALSE);
2400
2401         /* create copies of the data buffer for the callers */
2402         abuf = buf;
2403         for (acb = callback_list; acb; acb = acb->acb_next) {
2404                 if (acb->acb_done) {
2405                         if (abuf == NULL)
2406                                 abuf = arc_buf_clone(buf);
2407                         acb->acb_buf = abuf;
2408                         abuf = NULL;
2409                 }
2410         }
2411         hdr->b_acb = NULL;
2412         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2413         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2414         if (abuf == buf)
2415                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2416
2417         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2418
2419         if (zio->io_error != 0) {
2420                 hdr->b_flags |= ARC_IO_ERROR;
2421                 if (hdr->b_state != arc_anon)
2422                         arc_change_state(arc_anon, hdr, hash_lock);
2423                 if (HDR_IN_HASH_TABLE(hdr))
2424                         buf_hash_remove(hdr);
2425                 freeable = refcount_is_zero(&hdr->b_refcnt);
2426         }
2427
2428         /*
2429          * Broadcast before we drop the hash_lock to avoid the possibility
2430          * that the hdr (and hence the cv) might be freed before we get to
2431          * the cv_broadcast().
2432          */
2433         cv_broadcast(&hdr->b_cv);
2434
2435         if (hash_lock) {
2436                 /*
2437                  * Only call arc_access on anonymous buffers.  This is because
2438                  * if we've issued an I/O for an evicted buffer, we've already
2439                  * called arc_access (to prevent any simultaneous readers from
2440                  * getting confused).
2441                  */
2442                 if (zio->io_error == 0 && hdr->b_state == arc_anon)
2443                         arc_access(hdr, hash_lock);
2444                 mutex_exit(hash_lock);
2445         } else {
2446                 /*
2447                  * This block was freed while we waited for the read to
2448                  * complete.  It has been removed from the hash table and
2449                  * moved to the anonymous state (so that it won't show up
2450                  * in the cache).
2451                  */
2452                 ASSERT3P(hdr->b_state, ==, arc_anon);
2453                 freeable = refcount_is_zero(&hdr->b_refcnt);
2454         }
2455
2456         /* execute each callback and free its structure */
2457         while ((acb = callback_list) != NULL) {
2458                 if (acb->acb_done)
2459                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2460
2461                 if (acb->acb_zio_dummy != NULL) {
2462                         acb->acb_zio_dummy->io_error = zio->io_error;
2463                         zio_nowait(acb->acb_zio_dummy);
2464                 }
2465
2466                 callback_list = acb->acb_next;
2467                 kmem_free(acb, sizeof (arc_callback_t));
2468         }
2469
2470         if (freeable)
2471                 arc_hdr_destroy(hdr);
2472 }
2473
2474 /*
2475  * "Read" the block block at the specified DVA (in bp) via the
2476  * cache.  If the block is found in the cache, invoke the provided
2477  * callback immediately and return.  Note that the `zio' parameter
2478  * in the callback will be NULL in this case, since no IO was
2479  * required.  If the block is not in the cache pass the read request
2480  * on to the spa with a substitute callback function, so that the
2481  * requested block will be added to the cache.
2482  *
2483  * If a read request arrives for a block that has a read in-progress,
2484  * either wait for the in-progress read to complete (and return the
2485  * results); or, if this is a read with a "done" func, add a record
2486  * to the read to invoke the "done" func when the read completes,
2487  * and return; or just return.
2488  *
2489  * arc_read_done() will invoke all the requested "done" functions
2490  * for readers of this block.
2491  *
2492  * Normal callers should use arc_read and pass the arc buffer and offset
2493  * for the bp.  But if you know you don't need locking, you can use
2494  * arc_read_bp.
2495  */
2496 int
2497 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
2498     arc_done_func_t *done, void *private, int priority, int zio_flags,
2499     uint32_t *arc_flags, const zbookmark_t *zb)
2500 {
2501         int err;
2502         arc_buf_hdr_t *hdr = pbuf->b_hdr;
2503
2504         ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2505         ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2506         rw_enter(&pbuf->b_lock, RW_READER);
2507
2508         err = arc_read_nolock(pio, spa, bp, done, private, priority,
2509             zio_flags, arc_flags, zb);
2510
2511         ASSERT3P(hdr, ==, pbuf->b_hdr);
2512         rw_exit(&pbuf->b_lock);
2513         return (err);
2514 }
2515
2516 int
2517 arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
2518     arc_done_func_t *done, void *private, int priority, int zio_flags,
2519     uint32_t *arc_flags, const zbookmark_t *zb)
2520 {
2521         arc_buf_hdr_t *hdr;
2522         arc_buf_t *buf;
2523         kmutex_t *hash_lock;
2524         zio_t *rzio;
2525
2526 top:
2527         hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2528         if (hdr && hdr->b_datacnt > 0) {
2529
2530                 *arc_flags |= ARC_CACHED;
2531
2532                 if (HDR_IO_IN_PROGRESS(hdr)) {
2533
2534                         if (*arc_flags & ARC_WAIT) {
2535                                 cv_wait(&hdr->b_cv, hash_lock);
2536                                 mutex_exit(hash_lock);
2537                                 goto top;
2538                         }
2539                         ASSERT(*arc_flags & ARC_NOWAIT);
2540
2541                         if (done) {
2542                                 arc_callback_t  *acb = NULL;
2543
2544                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2545                                     KM_SLEEP);
2546                                 acb->acb_done = done;
2547                                 acb->acb_private = private;
2548                                 if (pio != NULL)
2549                                         acb->acb_zio_dummy = zio_null(pio,
2550                                             spa, NULL, NULL, zio_flags);
2551
2552                                 ASSERT(acb->acb_done != NULL);
2553                                 acb->acb_next = hdr->b_acb;
2554                                 hdr->b_acb = acb;
2555                                 add_reference(hdr, hash_lock, private);
2556                                 mutex_exit(hash_lock);
2557                                 return (0);
2558                         }
2559                         mutex_exit(hash_lock);
2560                         return (0);
2561                 }
2562
2563                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2564
2565                 if (done) {
2566                         add_reference(hdr, hash_lock, private);
2567                         /*
2568                          * If this block is already in use, create a new
2569                          * copy of the data so that we will be guaranteed
2570                          * that arc_release() will always succeed.
2571                          */
2572                         buf = hdr->b_buf;
2573                         ASSERT(buf);
2574                         ASSERT(buf->b_data);
2575                         if (HDR_BUF_AVAILABLE(hdr)) {
2576                                 ASSERT(buf->b_efunc == NULL);
2577                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2578                         } else {
2579                                 buf = arc_buf_clone(buf);
2580                         }
2581                 } else if (*arc_flags & ARC_PREFETCH &&
2582                     refcount_count(&hdr->b_refcnt) == 0) {
2583                         hdr->b_flags |= ARC_PREFETCH;
2584                 }
2585                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2586                 arc_access(hdr, hash_lock);
2587                 if (*arc_flags & ARC_L2CACHE)
2588                         hdr->b_flags |= ARC_L2CACHE;
2589                 mutex_exit(hash_lock);
2590                 ARCSTAT_BUMP(arcstat_hits);
2591                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2592                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2593                     data, metadata, hits);
2594
2595                 if (done)
2596                         done(NULL, buf, private);
2597         } else {
2598                 uint64_t size = BP_GET_LSIZE(bp);
2599                 arc_callback_t  *acb;
2600                 vdev_t *vd = NULL;
2601                 daddr_t addr;
2602
2603                 if (hdr == NULL) {
2604                         /* this block is not in the cache */
2605                         arc_buf_hdr_t   *exists;
2606                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2607                         buf = arc_buf_alloc(spa, size, private, type);
2608                         hdr = buf->b_hdr;
2609                         hdr->b_dva = *BP_IDENTITY(bp);
2610                         hdr->b_birth = bp->blk_birth;
2611                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2612                         exists = buf_hash_insert(hdr, &hash_lock);
2613                         if (exists) {
2614                                 /* somebody beat us to the hash insert */
2615                                 mutex_exit(hash_lock);
2616                                 bzero(&hdr->b_dva, sizeof (dva_t));
2617                                 hdr->b_birth = 0;
2618                                 hdr->b_cksum0 = 0;
2619                                 (void) arc_buf_remove_ref(buf, private);
2620                                 goto top; /* restart the IO request */
2621                         }
2622                         /* if this is a prefetch, we don't have a reference */
2623                         if (*arc_flags & ARC_PREFETCH) {
2624                                 (void) remove_reference(hdr, hash_lock,
2625                                     private);
2626                                 hdr->b_flags |= ARC_PREFETCH;
2627                         }
2628                         if (*arc_flags & ARC_L2CACHE)
2629                                 hdr->b_flags |= ARC_L2CACHE;
2630                         if (BP_GET_LEVEL(bp) > 0)
2631                                 hdr->b_flags |= ARC_INDIRECT;
2632                 } else {
2633                         /* this block is in the ghost cache */
2634                         ASSERT(GHOST_STATE(hdr->b_state));
2635                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2636                         ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2637                         ASSERT(hdr->b_buf == NULL);
2638
2639                         /* if this is a prefetch, we don't have a reference */
2640                         if (*arc_flags & ARC_PREFETCH)
2641                                 hdr->b_flags |= ARC_PREFETCH;
2642                         else
2643                                 add_reference(hdr, hash_lock, private);
2644                         if (*arc_flags & ARC_L2CACHE)
2645                                 hdr->b_flags |= ARC_L2CACHE;
2646                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2647                         buf->b_hdr = hdr;
2648                         buf->b_data = NULL;
2649                         buf->b_efunc = NULL;
2650                         buf->b_private = NULL;
2651                         buf->b_next = NULL;
2652                         hdr->b_buf = buf;
2653                         arc_get_data_buf(buf);
2654                         ASSERT(hdr->b_datacnt == 0);
2655                         hdr->b_datacnt = 1;
2656
2657                 }
2658
2659                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2660                 acb->acb_done = done;
2661                 acb->acb_private = private;
2662
2663                 ASSERT(hdr->b_acb == NULL);
2664                 hdr->b_acb = acb;
2665                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2666
2667                 /*
2668                  * If the buffer has been evicted, migrate it to a present state
2669                  * before issuing the I/O.  Once we drop the hash-table lock,
2670                  * the header will be marked as I/O in progress and have an
2671                  * attached buffer.  At this point, anybody who finds this
2672                  * buffer ought to notice that it's legit but has a pending I/O.
2673                  */
2674
2675                 if (GHOST_STATE(hdr->b_state))
2676                         arc_access(hdr, hash_lock);
2677
2678                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2679                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2680                         addr = hdr->b_l2hdr->b_daddr;
2681                         /*
2682                          * Lock out device removal.
2683                          */
2684                         if (vdev_is_dead(vd) ||
2685                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2686                                 vd = NULL;
2687                 }
2688
2689                 mutex_exit(hash_lock);
2690
2691                 ASSERT3U(hdr->b_size, ==, size);
2692                 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2693                     zbookmark_t *, zb);
2694                 ARCSTAT_BUMP(arcstat_misses);
2695                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2696                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2697                     data, metadata, misses);
2698
2699                 if (vd != NULL) {
2700                         /*
2701                          * Read from the L2ARC if the following are true:
2702                          * 1. The L2ARC vdev was previously cached.
2703                          * 2. This buffer still has L2ARC metadata.
2704                          * 3. This buffer isn't currently writing to the L2ARC.
2705                          * 4. The L2ARC entry wasn't evicted, which may
2706                          *    also have invalidated the vdev.
2707                          */
2708                         if (hdr->b_l2hdr != NULL &&
2709                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
2710                                 l2arc_read_callback_t *cb;
2711
2712                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2713                                 ARCSTAT_BUMP(arcstat_l2_hits);
2714
2715                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2716                                     KM_SLEEP);
2717                                 cb->l2rcb_buf = buf;
2718                                 cb->l2rcb_spa = spa;
2719                                 cb->l2rcb_bp = *bp;
2720                                 cb->l2rcb_zb = *zb;
2721                                 cb->l2rcb_flags = zio_flags;
2722
2723                                 /*
2724                                  * l2arc read.  The SCL_L2ARC lock will be
2725                                  * released by l2arc_read_done().
2726                                  */
2727                                 rzio = zio_read_phys(pio, vd, addr, size,
2728                                     buf->b_data, ZIO_CHECKSUM_OFF,
2729                                     l2arc_read_done, cb, priority, zio_flags |
2730                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2731                                     ZIO_FLAG_DONT_PROPAGATE |
2732                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
2733                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2734                                     zio_t *, rzio);
2735
2736                                 if (*arc_flags & ARC_NOWAIT) {
2737                                         zio_nowait(rzio);
2738                                         return (0);
2739                                 }
2740
2741                                 ASSERT(*arc_flags & ARC_WAIT);
2742                                 if (zio_wait(rzio) == 0)
2743                                         return (0);
2744
2745                                 /* l2arc read error; goto zio_read() */
2746                         } else {
2747                                 DTRACE_PROBE1(l2arc__miss,
2748                                     arc_buf_hdr_t *, hdr);
2749                                 ARCSTAT_BUMP(arcstat_l2_misses);
2750                                 if (HDR_L2_WRITING(hdr))
2751                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
2752                                 spa_config_exit(spa, SCL_L2ARC, vd);
2753                         }
2754                 }
2755
2756                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
2757                     arc_read_done, buf, priority, zio_flags, zb);
2758
2759                 if (*arc_flags & ARC_WAIT)
2760                         return (zio_wait(rzio));
2761
2762                 ASSERT(*arc_flags & ARC_NOWAIT);
2763                 zio_nowait(rzio);
2764         }
2765         return (0);
2766 }
2767
2768 /*
2769  * arc_read() variant to support pool traversal.  If the block is already
2770  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2771  * The idea is that we don't want pool traversal filling up memory, but
2772  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2773  */
2774 int
2775 arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2776 {
2777         arc_buf_hdr_t *hdr;
2778         kmutex_t *hash_mtx;
2779         int rc = 0;
2780
2781         hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2782
2783         if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2784                 arc_buf_t *buf = hdr->b_buf;
2785
2786                 ASSERT(buf);
2787                 while (buf->b_data == NULL) {
2788                         buf = buf->b_next;
2789                         ASSERT(buf);
2790                 }
2791                 bcopy(buf->b_data, data, hdr->b_size);
2792         } else {
2793                 rc = ENOENT;
2794         }
2795
2796         if (hash_mtx)
2797                 mutex_exit(hash_mtx);
2798
2799         return (rc);
2800 }
2801
2802 void
2803 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2804 {
2805         ASSERT(buf->b_hdr != NULL);
2806         ASSERT(buf->b_hdr->b_state != arc_anon);
2807         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2808         buf->b_efunc = func;
2809         buf->b_private = private;
2810 }
2811
2812 /*
2813  * This is used by the DMU to let the ARC know that a buffer is
2814  * being evicted, so the ARC should clean up.  If this arc buf
2815  * is not yet in the evicted state, it will be put there.
2816  */
2817 int
2818 arc_buf_evict(arc_buf_t *buf)
2819 {
2820         arc_buf_hdr_t *hdr;
2821         kmutex_t *hash_lock;
2822         arc_buf_t **bufp;
2823
2824         rw_enter(&buf->b_lock, RW_WRITER);
2825         hdr = buf->b_hdr;
2826         if (hdr == NULL) {
2827                 /*
2828                  * We are in arc_do_user_evicts().
2829                  */
2830                 ASSERT(buf->b_data == NULL);
2831                 rw_exit(&buf->b_lock);
2832                 return (0);
2833         } else if (buf->b_data == NULL) {
2834                 arc_buf_t copy = *buf; /* structure assignment */
2835                 /*
2836                  * We are on the eviction list; process this buffer now
2837                  * but let arc_do_user_evicts() do the reaping.
2838                  */
2839                 buf->b_efunc = NULL;
2840                 rw_exit(&buf->b_lock);
2841                 VERIFY(copy.b_efunc(&copy) == 0);
2842                 return (1);
2843         }
2844         hash_lock = HDR_LOCK(hdr);
2845         mutex_enter(hash_lock);
2846
2847         ASSERT(buf->b_hdr == hdr);
2848         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2849         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2850
2851         /*
2852          * Pull this buffer off of the hdr
2853          */
2854         bufp = &hdr->b_buf;
2855         while (*bufp != buf)
2856                 bufp = &(*bufp)->b_next;
2857         *bufp = buf->b_next;
2858
2859         ASSERT(buf->b_data != NULL);
2860         arc_buf_destroy(buf, FALSE, FALSE);
2861
2862         if (hdr->b_datacnt == 0) {
2863                 arc_state_t *old_state = hdr->b_state;
2864                 arc_state_t *evicted_state;
2865
2866                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2867
2868                 evicted_state =
2869                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2870
2871                 mutex_enter(&old_state->arcs_mtx);
2872                 mutex_enter(&evicted_state->arcs_mtx);
2873
2874                 arc_change_state(evicted_state, hdr, hash_lock);
2875                 ASSERT(HDR_IN_HASH_TABLE(hdr));
2876                 hdr->b_flags |= ARC_IN_HASH_TABLE;
2877                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2878
2879                 mutex_exit(&evicted_state->arcs_mtx);
2880                 mutex_exit(&old_state->arcs_mtx);
2881         }
2882         mutex_exit(hash_lock);
2883         rw_exit(&buf->b_lock);
2884
2885         VERIFY(buf->b_efunc(buf) == 0);
2886         buf->b_efunc = NULL;
2887         buf->b_private = NULL;
2888         buf->b_hdr = NULL;
2889         kmem_cache_free(buf_cache, buf);
2890         return (1);
2891 }
2892
2893 /*
2894  * Release this buffer from the cache.  This must be done
2895  * after a read and prior to modifying the buffer contents.
2896  * If the buffer has more than one reference, we must make
2897  * a new hdr for the buffer.
2898  */
2899 void
2900 arc_release(arc_buf_t *buf, void *tag)
2901 {
2902         arc_buf_hdr_t *hdr;
2903         kmutex_t *hash_lock;
2904         l2arc_buf_hdr_t *l2hdr;
2905         uint64_t buf_size;
2906
2907         rw_enter(&buf->b_lock, RW_WRITER);
2908         hdr = buf->b_hdr;
2909
2910         /* this buffer is not on any list */
2911         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2912         ASSERT(!(hdr->b_flags & ARC_STORED));
2913
2914         if (hdr->b_state == arc_anon) {
2915                 /* this buffer is already released */
2916                 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2917                 ASSERT(BUF_EMPTY(hdr));
2918                 ASSERT(buf->b_efunc == NULL);
2919                 arc_buf_thaw(buf);
2920                 rw_exit(&buf->b_lock);
2921                 return;
2922         }
2923
2924         hash_lock = HDR_LOCK(hdr);
2925         mutex_enter(hash_lock);
2926
2927         l2hdr = hdr->b_l2hdr;
2928         if (l2hdr) {
2929                 mutex_enter(&l2arc_buflist_mtx);
2930                 hdr->b_l2hdr = NULL;
2931                 buf_size = hdr->b_size;
2932         }
2933
2934         /*
2935          * Do we have more than one buf?
2936          */
2937         if (hdr->b_datacnt > 1) {
2938                 arc_buf_hdr_t *nhdr;
2939                 arc_buf_t **bufp;
2940                 uint64_t blksz = hdr->b_size;
2941                 spa_t *spa = hdr->b_spa;
2942                 arc_buf_contents_t type = hdr->b_type;
2943                 uint32_t flags = hdr->b_flags;
2944
2945                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
2946                 /*
2947                  * Pull the data off of this buf and attach it to
2948                  * a new anonymous buf.
2949                  */
2950                 (void) remove_reference(hdr, hash_lock, tag);
2951                 bufp = &hdr->b_buf;
2952                 while (*bufp != buf)
2953                         bufp = &(*bufp)->b_next;
2954                 *bufp = (*bufp)->b_next;
2955                 buf->b_next = NULL;
2956
2957                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
2958                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
2959                 if (refcount_is_zero(&hdr->b_refcnt)) {
2960                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
2961                         ASSERT3U(*size, >=, hdr->b_size);
2962                         atomic_add_64(size, -hdr->b_size);
2963                 }
2964                 hdr->b_datacnt -= 1;
2965                 arc_cksum_verify(buf);
2966
2967                 mutex_exit(hash_lock);
2968
2969                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
2970                 nhdr->b_size = blksz;
2971                 nhdr->b_spa = spa;
2972                 nhdr->b_type = type;
2973                 nhdr->b_buf = buf;
2974                 nhdr->b_state = arc_anon;
2975                 nhdr->b_arc_access = 0;
2976                 nhdr->b_flags = flags & ARC_L2_WRITING;
2977                 nhdr->b_l2hdr = NULL;
2978                 nhdr->b_datacnt = 1;
2979                 nhdr->b_freeze_cksum = NULL;
2980                 (void) refcount_add(&nhdr->b_refcnt, tag);
2981                 buf->b_hdr = nhdr;
2982                 rw_exit(&buf->b_lock);
2983                 atomic_add_64(&arc_anon->arcs_size, blksz);
2984         } else {
2985                 rw_exit(&buf->b_lock);
2986                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2987                 ASSERT(!list_link_active(&hdr->b_arc_node));
2988                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2989                 arc_change_state(arc_anon, hdr, hash_lock);
2990                 hdr->b_arc_access = 0;
2991                 mutex_exit(hash_lock);
2992
2993                 bzero(&hdr->b_dva, sizeof (dva_t));
2994                 hdr->b_birth = 0;
2995                 hdr->b_cksum0 = 0;
2996                 arc_buf_thaw(buf);
2997         }
2998         buf->b_efunc = NULL;
2999         buf->b_private = NULL;
3000
3001         if (l2hdr) {
3002                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3003                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3004                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3005                 mutex_exit(&l2arc_buflist_mtx);
3006         }
3007 }
3008
3009 int
3010 arc_released(arc_buf_t *buf)
3011 {
3012         int released;
3013
3014         rw_enter(&buf->b_lock, RW_READER);
3015         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3016         rw_exit(&buf->b_lock);
3017         return (released);
3018 }
3019
3020 int
3021 arc_has_callback(arc_buf_t *buf)
3022 {
3023         int callback;
3024
3025         rw_enter(&buf->b_lock, RW_READER);
3026         callback = (buf->b_efunc != NULL);
3027         rw_exit(&buf->b_lock);
3028         return (callback);
3029 }
3030
3031 #ifdef ZFS_DEBUG
3032 int
3033 arc_referenced(arc_buf_t *buf)
3034 {
3035         int referenced;
3036
3037         rw_enter(&buf->b_lock, RW_READER);
3038         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3039         rw_exit(&buf->b_lock);
3040         return (referenced);
3041 }
3042 #endif
3043
3044 static void
3045 arc_write_ready(zio_t *zio)
3046 {
3047         arc_write_callback_t *callback = zio->io_private;
3048         arc_buf_t *buf = callback->awcb_buf;
3049         arc_buf_hdr_t *hdr = buf->b_hdr;
3050
3051         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3052         callback->awcb_ready(zio, buf, callback->awcb_private);
3053
3054         /*
3055          * If the IO is already in progress, then this is a re-write
3056          * attempt, so we need to thaw and re-compute the cksum.
3057          * It is the responsibility of the callback to handle the
3058          * accounting for any re-write attempt.
3059          */
3060         if (HDR_IO_IN_PROGRESS(hdr)) {
3061                 mutex_enter(&hdr->b_freeze_lock);
3062                 if (hdr->b_freeze_cksum != NULL) {
3063                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3064                         hdr->b_freeze_cksum = NULL;
3065                 }
3066                 mutex_exit(&hdr->b_freeze_lock);
3067         }
3068         arc_cksum_compute(buf, B_FALSE);
3069         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3070 }
3071
3072 static void
3073 arc_write_done(zio_t *zio)
3074 {
3075         arc_write_callback_t *callback = zio->io_private;
3076         arc_buf_t *buf = callback->awcb_buf;
3077         arc_buf_hdr_t *hdr = buf->b_hdr;
3078
3079         hdr->b_acb = NULL;
3080
3081         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3082         hdr->b_birth = zio->io_bp->blk_birth;
3083         hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3084         /*
3085          * If the block to be written was all-zero, we may have
3086          * compressed it away.  In this case no write was performed
3087          * so there will be no dva/birth-date/checksum.  The buffer
3088          * must therefor remain anonymous (and uncached).
3089          */
3090         if (!BUF_EMPTY(hdr)) {
3091                 arc_buf_hdr_t *exists;
3092                 kmutex_t *hash_lock;
3093
3094                 arc_cksum_verify(buf);
3095
3096                 exists = buf_hash_insert(hdr, &hash_lock);
3097                 if (exists) {
3098                         /*
3099                          * This can only happen if we overwrite for
3100                          * sync-to-convergence, because we remove
3101                          * buffers from the hash table when we arc_free().
3102                          */
3103                         ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
3104                         ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
3105                             BP_IDENTITY(zio->io_bp)));
3106                         ASSERT3U(zio->io_bp_orig.blk_birth, ==,
3107                             zio->io_bp->blk_birth);
3108
3109                         ASSERT(refcount_is_zero(&exists->b_refcnt));
3110                         arc_change_state(arc_anon, exists, hash_lock);
3111                         mutex_exit(hash_lock);
3112                         arc_hdr_destroy(exists);
3113                         exists = buf_hash_insert(hdr, &hash_lock);
3114                         ASSERT3P(exists, ==, NULL);
3115                 }
3116                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3117                 /* if it's not anon, we are doing a scrub */
3118                 if (hdr->b_state == arc_anon)
3119                         arc_access(hdr, hash_lock);
3120                 mutex_exit(hash_lock);
3121         } else if (callback->awcb_done == NULL) {
3122                 int destroy_hdr;
3123                 /*
3124                  * This is an anonymous buffer with no user callback,
3125                  * destroy it if there are no active references.
3126                  */
3127                 mutex_enter(&arc_eviction_mtx);
3128                 destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
3129                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3130                 mutex_exit(&arc_eviction_mtx);
3131                 if (destroy_hdr)
3132                         arc_hdr_destroy(hdr);
3133         } else {
3134                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3135         }
3136         hdr->b_flags &= ~ARC_STORED;
3137
3138         if (callback->awcb_done) {
3139                 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3140                 callback->awcb_done(zio, buf, callback->awcb_private);
3141         }
3142
3143         kmem_free(callback, sizeof (arc_write_callback_t));
3144 }
3145
3146 static void
3147 write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
3148 {
3149         boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
3150
3151         /* Determine checksum setting */
3152         if (ismd) {
3153                 /*
3154                  * Metadata always gets checksummed.  If the data
3155                  * checksum is multi-bit correctable, and it's not a
3156                  * ZBT-style checksum, then it's suitable for metadata
3157                  * as well.  Otherwise, the metadata checksum defaults
3158                  * to fletcher4.
3159                  */
3160                 if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
3161                     !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
3162                         zp->zp_checksum = wp->wp_oschecksum;
3163                 else
3164                         zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
3165         } else {
3166                 zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
3167                     wp->wp_oschecksum);
3168         }
3169
3170         /* Determine compression setting */
3171         if (ismd) {
3172                 /*
3173                  * XXX -- we should design a compression algorithm
3174                  * that specializes in arrays of bps.
3175                  */
3176                 zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
3177                     ZIO_COMPRESS_LZJB;
3178         } else {
3179                 zp->zp_compress = zio_compress_select(wp->wp_dncompress,
3180                     wp->wp_oscompress);
3181         }
3182
3183         zp->zp_type = wp->wp_type;
3184         zp->zp_level = wp->wp_level;
3185         zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
3186 }
3187
3188 zio_t *
3189 arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
3190     boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
3191     arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
3192     int zio_flags, const zbookmark_t *zb)
3193 {
3194         arc_buf_hdr_t *hdr = buf->b_hdr;
3195         arc_write_callback_t *callback;
3196         zio_t *zio;
3197         zio_prop_t zp;
3198
3199         ASSERT(ready != NULL);
3200         ASSERT(!HDR_IO_ERROR(hdr));
3201         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3202         ASSERT(hdr->b_acb == 0);
3203         if (l2arc)
3204                 hdr->b_flags |= ARC_L2CACHE;
3205         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3206         callback->awcb_ready = ready;
3207         callback->awcb_done = done;
3208         callback->awcb_private = private;
3209         callback->awcb_buf = buf;
3210
3211         write_policy(spa, wp, &zp);
3212         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
3213             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3214
3215         return (zio);
3216 }
3217
3218 int
3219 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
3220     zio_done_func_t *done, void *private, uint32_t arc_flags)
3221 {
3222         arc_buf_hdr_t *ab;
3223         kmutex_t *hash_lock;
3224         zio_t   *zio;
3225
3226         /*
3227          * If this buffer is in the cache, release it, so it
3228          * can be re-used.
3229          */
3230         ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
3231         if (ab != NULL) {
3232                 /*
3233                  * The checksum of blocks to free is not always
3234                  * preserved (eg. on the deadlist).  However, if it is
3235                  * nonzero, it should match what we have in the cache.
3236                  */
3237                 ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
3238                     bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
3239                     bp->blk_fill == BLK_FILL_ALREADY_FREED);
3240
3241                 if (ab->b_state != arc_anon)
3242                         arc_change_state(arc_anon, ab, hash_lock);
3243                 if (HDR_IO_IN_PROGRESS(ab)) {
3244                         /*
3245                          * This should only happen when we prefetch.
3246                          */
3247                         ASSERT(ab->b_flags & ARC_PREFETCH);
3248                         ASSERT3U(ab->b_datacnt, ==, 1);
3249                         ab->b_flags |= ARC_FREED_IN_READ;
3250                         if (HDR_IN_HASH_TABLE(ab))
3251                                 buf_hash_remove(ab);
3252                         ab->b_arc_access = 0;
3253                         bzero(&ab->b_dva, sizeof (dva_t));
3254                         ab->b_birth = 0;
3255                         ab->b_cksum0 = 0;
3256                         ab->b_buf->b_efunc = NULL;
3257                         ab->b_buf->b_private = NULL;
3258                         mutex_exit(hash_lock);
3259                 } else if (refcount_is_zero(&ab->b_refcnt)) {
3260                         ab->b_flags |= ARC_FREE_IN_PROGRESS;
3261                         mutex_exit(hash_lock);
3262                         arc_hdr_destroy(ab);
3263                         ARCSTAT_BUMP(arcstat_deleted);
3264                 } else {
3265                         /*
3266                          * We still have an active reference on this
3267                          * buffer.  This can happen, e.g., from
3268                          * dbuf_unoverride().
3269                          */
3270                         ASSERT(!HDR_IN_HASH_TABLE(ab));
3271                         ab->b_arc_access = 0;
3272                         bzero(&ab->b_dva, sizeof (dva_t));
3273                         ab->b_birth = 0;
3274                         ab->b_cksum0 = 0;
3275                         ab->b_buf->b_efunc = NULL;
3276                         ab->b_buf->b_private = NULL;
3277                         mutex_exit(hash_lock);
3278                 }
3279         }
3280
3281         zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
3282
3283         if (arc_flags & ARC_WAIT)
3284                 return (zio_wait(zio));
3285
3286         ASSERT(arc_flags & ARC_NOWAIT);
3287         zio_nowait(zio);
3288
3289         return (0);
3290 }
3291
3292 static int
3293 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3294 {
3295 #ifdef _KERNEL
3296         uint64_t inflight_data = arc_anon->arcs_size;
3297         uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
3298         static uint64_t page_load = 0;
3299         static uint64_t last_txg = 0;
3300
3301 #if 0
3302 #if defined(__i386)
3303         available_memory =
3304             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3305 #endif
3306 #endif
3307         if (available_memory >= zfs_write_limit_max)
3308                 return (0);
3309
3310         if (txg > last_txg) {
3311                 last_txg = txg;
3312                 page_load = 0;
3313         }
3314         /*
3315          * If we are in pageout, we know that memory is already tight,
3316          * the arc is already going to be evicting, so we just want to
3317          * continue to let page writes occur as quickly as possible.
3318          */
3319         if (curproc == pageproc) {
3320                 if (page_load > available_memory / 4)
3321                         return (ERESTART);
3322                 /* Note: reserve is inflated, so we deflate */
3323                 page_load += reserve / 8;
3324                 return (0);
3325         } else if (page_load > 0 && arc_reclaim_needed()) {
3326                 /* memory is low, delay before restarting */
3327                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3328                 return (EAGAIN);
3329         }
3330         page_load = 0;
3331
3332         if (arc_size > arc_c_min) {
3333                 uint64_t evictable_memory =
3334                     arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3335                     arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3336                     arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3337                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3338                 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3339         }
3340
3341         if (inflight_data > available_memory / 4) {
3342                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3343                 return (ERESTART);
3344         }
3345 #endif
3346         return (0);
3347 }
3348
3349 void
3350 arc_tempreserve_clear(uint64_t reserve)
3351 {
3352         atomic_add_64(&arc_tempreserve, -reserve);
3353         ASSERT((int64_t)arc_tempreserve >= 0);
3354 }
3355
3356 int
3357 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3358 {
3359         int error;
3360
3361 #ifdef ZFS_DEBUG
3362         /*
3363          * Once in a while, fail for no reason.  Everything should cope.
3364          */
3365         if (spa_get_random(10000) == 0) {
3366                 dprintf("forcing random failure\n");
3367                 return (ERESTART);
3368         }
3369 #endif
3370         if (reserve > arc_c/4 && !arc_no_grow)
3371                 arc_c = MIN(arc_c_max, reserve * 4);
3372         if (reserve > arc_c)
3373                 return (ENOMEM);
3374
3375         /*
3376          * Writes will, almost always, require additional memory allocations
3377          * in order to compress/encrypt/etc the data.  We therefor need to
3378          * make sure that there is sufficient available memory for this.
3379          */
3380         if (error = arc_memory_throttle(reserve, txg))
3381                 return (error);
3382
3383         /*
3384          * Throttle writes when the amount of dirty data in the cache
3385          * gets too large.  We try to keep the cache less than half full
3386          * of dirty blocks so that our sync times don't grow too large.
3387          * Note: if two requests come in concurrently, we might let them
3388          * both succeed, when one of them should fail.  Not a huge deal.
3389          */
3390         if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
3391             arc_anon->arcs_size > arc_c / 4) {
3392                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3393                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3394                     arc_tempreserve>>10,
3395                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3396                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3397                     reserve>>10, arc_c>>10);
3398                 return (ERESTART);
3399         }
3400         atomic_add_64(&arc_tempreserve, reserve);
3401         return (0);
3402 }
3403
3404 static kmutex_t arc_lowmem_lock;
3405 #ifdef _KERNEL
3406 static eventhandler_tag arc_event_lowmem = NULL;
3407
3408 static void
3409 arc_lowmem(void *arg __unused, int howto __unused)
3410 {
3411
3412         /* Serialize access via arc_lowmem_lock. */
3413         mutex_enter(&arc_lowmem_lock);
3414         needfree = 1;
3415         cv_signal(&arc_reclaim_thr_cv);
3416         while (needfree)
3417                 tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
3418         mutex_exit(&arc_lowmem_lock);
3419 }
3420 #endif
3421
3422 void
3423 arc_init(void)
3424 {
3425         int prefetch_tunable_set = 0;
3426
3427         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3428         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3429         mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3430
3431         /* Convert seconds to clock ticks */
3432         arc_min_prefetch_lifespan = 1 * hz;
3433
3434         /* Start out with 1/8 of all memory */
3435         arc_c = kmem_size() / 8;
3436 #if 0
3437 #ifdef _KERNEL
3438         /*
3439          * On architectures where the physical memory can be larger
3440          * than the addressable space (intel in 32-bit mode), we may
3441          * need to limit the cache to 1/8 of VM size.
3442          */
3443         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3444 #endif
3445 #endif
3446         /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3447         arc_c_min = MAX(arc_c / 4, 64<<18);
3448         /* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3449         if (arc_c * 8 >= 1<<30)
3450                 arc_c_max = (arc_c * 8) - (1<<30);
3451         else
3452                 arc_c_max = arc_c_min;
3453         arc_c_max = MAX(arc_c * 5, arc_c_max);
3454 #ifdef _KERNEL
3455         /*
3456          * Allow the tunables to override our calculations if they are
3457          * reasonable (ie. over 16MB)
3458          */
3459         if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
3460                 arc_c_max = zfs_arc_max;
3461         if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
3462                 arc_c_min = zfs_arc_min;
3463 #endif
3464         arc_c = arc_c_max;
3465         arc_p = (arc_c >> 1);
3466
3467         /* limit meta-data to 1/4 of the arc capacity */
3468         arc_meta_limit = arc_c_max / 4;
3469
3470         /* Allow the tunable to override if it is reasonable */
3471         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3472                 arc_meta_limit = zfs_arc_meta_limit;
3473
3474         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3475                 arc_c_min = arc_meta_limit / 2;
3476
3477         /* if kmem_flags are set, lets try to use less memory */
3478         if (kmem_debugging())
3479                 arc_c = arc_c / 2;
3480         if (arc_c < arc_c_min)
3481                 arc_c = arc_c_min;
3482
3483         zfs_arc_min = arc_c_min;
3484         zfs_arc_max = arc_c_max;
3485
3486         arc_anon = &ARC_anon;
3487         arc_mru = &ARC_mru;
3488         arc_mru_ghost = &ARC_mru_ghost;
3489         arc_mfu = &ARC_mfu;
3490         arc_mfu_ghost = &ARC_mfu_ghost;
3491         arc_l2c_only = &ARC_l2c_only;
3492         arc_size = 0;
3493
3494         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3495         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3496         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3497         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3498         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3499         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3500
3501         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3502             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3503         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3504             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3505         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3506             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3507         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3508             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3509         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3510             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3511         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3512             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3513         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3514             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3515         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3516             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3517         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3518             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3519         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3520             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3521
3522         buf_init();
3523
3524         arc_thread_exit = 0;
3525         arc_eviction_list = NULL;
3526         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3527         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3528
3529         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3530             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3531
3532         if (arc_ksp != NULL) {
3533                 arc_ksp->ks_data = &arc_stats;
3534                 kstat_install(arc_ksp);
3535         }
3536
3537         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3538             TS_RUN, minclsyspri);
3539
3540 #ifdef _KERNEL
3541         arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
3542             EVENTHANDLER_PRI_FIRST);
3543 #endif
3544
3545         arc_dead = FALSE;
3546         arc_warm = B_FALSE;
3547
3548         if (zfs_write_limit_max == 0)
3549                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3550         else
3551                 zfs_write_limit_shift = 0;
3552         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3553
3554 #ifdef _KERNEL
3555         if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
3556                 prefetch_tunable_set = 1;
3557
3558 #ifdef __i386__
3559         if (prefetch_tunable_set == 0) {
3560                 printf("ZFS NOTICE: prefetch is disabled by default on i386"
3561                     " - add enable to tunable to change.\n" );
3562                 zfs_prefetch_disable=1;
3563         }
3564 #else
3565         if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
3566             prefetch_tunable_set == 0) {
3567                 printf("ZFS NOTICE: system has less than 4GB and prefetch enable is not set"
3568                     "... disabling.\n");
3569                 zfs_prefetch_disable=1;
3570         }
3571 #endif
3572         /* Warn about ZFS memory and address space requirements. */
3573         if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
3574                 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
3575                     "expect unstable behavior.\n");
3576         }
3577         if (kmem_size() < 512 * (1 << 20)) {
3578                 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
3579                     "expect unstable behavior.\n");
3580                 printf("             Consider tuning vm.kmem_size and "
3581                     "vm.kmem_size_max\n");
3582                 printf("             in /boot/loader.conf.\n");
3583         }
3584 #endif
3585 }
3586
3587 void
3588 arc_fini(void)
3589 {
3590
3591         mutex_enter(&arc_reclaim_thr_lock);
3592         arc_thread_exit = 1;
3593         cv_signal(&arc_reclaim_thr_cv);
3594         while (arc_thread_exit != 0)
3595                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3596         mutex_exit(&arc_reclaim_thr_lock);
3597
3598         arc_flush(NULL);
3599
3600         arc_dead = TRUE;
3601
3602         if (arc_ksp != NULL) {
3603                 kstat_delete(arc_ksp);
3604                 arc_ksp = NULL;
3605         }
3606
3607         mutex_destroy(&arc_eviction_mtx);
3608         mutex_destroy(&arc_reclaim_thr_lock);
3609         cv_destroy(&arc_reclaim_thr_cv);
3610
3611         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3612         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3613         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3614         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3615         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3616         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3617         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3618         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3619
3620         mutex_destroy(&arc_anon->arcs_mtx);
3621         mutex_destroy(&arc_mru->arcs_mtx);
3622         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3623         mutex_destroy(&arc_mfu->arcs_mtx);
3624         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3625
3626         mutex_destroy(&zfs_write_limit_lock);
3627
3628         buf_fini();
3629
3630         mutex_destroy(&arc_lowmem_lock);
3631 #ifdef _KERNEL
3632         if (arc_event_lowmem != NULL)
3633                 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
3634 #endif
3635 }
3636
3637 /*
3638  * Level 2 ARC
3639  *
3640  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3641  * It uses dedicated storage devices to hold cached data, which are populated
3642  * using large infrequent writes.  The main role of this cache is to boost
3643  * the performance of random read workloads.  The intended L2ARC devices
3644  * include short-stroked disks, solid state disks, and other media with
3645  * substantially faster read latency than disk.
3646  *
3647  *                 +-----------------------+
3648  *                 |         ARC           |
3649  *                 +-----------------------+
3650  *                    |         ^     ^
3651  *                    |         |     |
3652  *      l2arc_feed_thread()    arc_read()
3653  *                    |         |     |
3654  *                    |  l2arc read   |
3655  *                    V         |     |
3656  *               +---------------+    |
3657  *               |     L2ARC     |    |
3658  *               +---------------+    |
3659  *                   |    ^           |
3660  *          l2arc_write() |           |
3661  *                   |    |           |
3662  *                   V    |           |
3663  *                 +-------+      +-------+
3664  *                 | vdev  |      | vdev  |
3665  *                 | cache |      | cache |
3666  *                 +-------+      +-------+
3667  *                 +=========+     .-----.
3668  *                 :  L2ARC  :    |-_____-|
3669  *                 : devices :    | Disks |
3670  *                 +=========+    `-_____-'
3671  *
3672  * Read requests are satisfied from the following sources, in order:
3673  *
3674  *      1) ARC
3675  *      2) vdev cache of L2ARC devices
3676  *      3) L2ARC devices
3677  *      4) vdev cache of disks
3678  *      5) disks
3679  *
3680  * Some L2ARC device types exhibit extremely slow write performance.
3681  * To accommodate for this there are some significant differences between
3682  * the L2ARC and traditional cache design:
3683  *
3684  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3685  * the ARC behave as usual, freeing buffers and placing headers on ghost
3686  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3687  * this would add inflated write latencies for all ARC memory pressure.
3688  *
3689  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3690  * It does this by periodically scanning buffers from the eviction-end of
3691  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3692  * not already there.  It scans until a headroom of buffers is satisfied,
3693  * which itself is a buffer for ARC eviction.  The thread that does this is
3694  * l2arc_feed_thread(), illustrated below; example sizes are included to
3695  * provide a better sense of ratio than this diagram:
3696  *
3697  *             head -->                        tail
3698  *              +---------------------+----------+
3699  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3700  *              +---------------------+----------+   |   o L2ARC eligible
3701  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3702  *              +---------------------+----------+   |
3703  *                   15.9 Gbytes      ^ 32 Mbytes    |
3704  *                                 headroom          |
3705  *                                            l2arc_feed_thread()
3706  *                                                   |
3707  *                       l2arc write hand <--[oooo]--'
3708  *                               |           8 Mbyte
3709  *                               |          write max
3710  *                               V
3711  *                +==============================+
3712  *      L2ARC dev |####|#|###|###|    |####| ... |
3713  *                +==============================+
3714  *                           32 Gbytes
3715  *
3716  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3717  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3718  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3719  * safe to say that this is an uncommon case, since buffers at the end of
3720  * the ARC lists have moved there due to inactivity.
3721  *
3722  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3723  * then the L2ARC simply misses copying some buffers.  This serves as a
3724  * pressure valve to prevent heavy read workloads from both stalling the ARC
3725  * with waits and clogging the L2ARC with writes.  This also helps prevent
3726  * the potential for the L2ARC to churn if it attempts to cache content too
3727  * quickly, such as during backups of the entire pool.
3728  *
3729  * 5. After system boot and before the ARC has filled main memory, there are
3730  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3731  * lists can remain mostly static.  Instead of searching from tail of these
3732  * lists as pictured, the l2arc_feed_thread() will search from the list heads
3733  * for eligible buffers, greatly increasing its chance of finding them.
3734  *
3735  * The L2ARC device write speed is also boosted during this time so that
3736  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3737  * there are no L2ARC reads, and no fear of degrading read performance
3738  * through increased writes.
3739  *
3740  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3741  * the vdev queue can aggregate them into larger and fewer writes.  Each
3742  * device is written to in a rotor fashion, sweeping writes through
3743  * available space then repeating.
3744  *
3745  * 7. The L2ARC does not store dirty content.  It never needs to flush
3746  * write buffers back to disk based storage.
3747  *
3748  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3749  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3750  *
3751  * The performance of the L2ARC can be tweaked by a number of tunables, which
3752  * may be necessary for different workloads:
3753  *
3754  *      l2arc_write_max         max write bytes per interval
3755  *      l2arc_write_boost       extra write bytes during device warmup
3756  *      l2arc_noprefetch        skip caching prefetched buffers
3757  *      l2arc_headroom          number of max device writes to precache
3758  *      l2arc_feed_secs         seconds between L2ARC writing
3759  *
3760  * Tunables may be removed or added as future performance improvements are
3761  * integrated, and also may become zpool properties.
3762  */
3763
3764 static void
3765 l2arc_hdr_stat_add(void)
3766 {
3767         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3768         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3769 }
3770
3771 static void
3772 l2arc_hdr_stat_remove(void)
3773 {
3774         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3775         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3776 }
3777
3778 /*
3779  * Cycle through L2ARC devices.  This is how L2ARC load balances.
3780  * If a device is returned, this also returns holding the spa config lock.
3781  */
3782 static l2arc_dev_t *
3783 l2arc_dev_get_next(void)
3784 {
3785         l2arc_dev_t *first, *next = NULL;
3786
3787         /*
3788          * Lock out the removal of spas (spa_namespace_lock), then removal
3789          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3790          * both locks will be dropped and a spa config lock held instead.
3791          */
3792         mutex_enter(&spa_namespace_lock);
3793         mutex_enter(&l2arc_dev_mtx);
3794
3795         /* if there are no vdevs, there is nothing to do */
3796         if (l2arc_ndev == 0)
3797                 goto out;
3798
3799         first = NULL;
3800         next = l2arc_dev_last;
3801         do {
3802                 /* loop around the list looking for a non-faulted vdev */
3803                 if (next == NULL) {
3804                         next = list_head(l2arc_dev_list);
3805                 } else {
3806                         next = list_next(l2arc_dev_list, next);
3807                         if (next == NULL)
3808                                 next = list_head(l2arc_dev_list);
3809                 }
3810
3811                 /* if we have come back to the start, bail out */
3812                 if (first == NULL)
3813                         first = next;
3814                 else if (next == first)
3815                         break;
3816
3817         } while (vdev_is_dead(next->l2ad_vdev));
3818
3819         /* if we were unable to find any usable vdevs, return NULL */
3820         if (vdev_is_dead(next->l2ad_vdev))
3821                 next = NULL;
3822
3823         l2arc_dev_last = next;
3824
3825 out:
3826         mutex_exit(&l2arc_dev_mtx);
3827
3828         /*
3829          * Grab the config lock to prevent the 'next' device from being
3830          * removed while we are writing to it.
3831          */
3832         if (next != NULL)
3833                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3834         mutex_exit(&spa_namespace_lock);
3835
3836         return (next);
3837 }
3838
3839 /*
3840  * Free buffers that were tagged for destruction.
3841  */
3842 static void
3843 l2arc_do_free_on_write()
3844 {
3845         list_t *buflist;
3846         l2arc_data_free_t *df, *df_prev;
3847
3848         mutex_enter(&l2arc_free_on_write_mtx);
3849         buflist = l2arc_free_on_write;
3850
3851         for (df = list_tail(buflist); df; df = df_prev) {
3852                 df_prev = list_prev(buflist, df);
3853                 ASSERT(df->l2df_data != NULL);
3854                 ASSERT(df->l2df_func != NULL);
3855                 df->l2df_func(df->l2df_data, df->l2df_size);
3856                 list_remove(buflist, df);
3857                 kmem_free(df, sizeof (l2arc_data_free_t));
3858         }
3859
3860         mutex_exit(&l2arc_free_on_write_mtx);
3861 }
3862
3863 /*
3864  * A write to a cache device has completed.  Update all headers to allow
3865  * reads from these buffers to begin.
3866  */
3867 static void
3868 l2arc_write_done(zio_t *zio)
3869 {
3870         l2arc_write_callback_t *cb;
3871         l2arc_dev_t *dev;
3872         list_t *buflist;
3873         arc_buf_hdr_t *head, *ab, *ab_prev;
3874         l2arc_buf_hdr_t *abl2;
3875         kmutex_t *hash_lock;
3876
3877         cb = zio->io_private;
3878         ASSERT(cb != NULL);
3879         dev = cb->l2wcb_dev;
3880         ASSERT(dev != NULL);
3881         head = cb->l2wcb_head;
3882         ASSERT(head != NULL);
3883         buflist = dev->l2ad_buflist;
3884         ASSERT(buflist != NULL);
3885         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
3886             l2arc_write_callback_t *, cb);
3887
3888         if (zio->io_error != 0)
3889                 ARCSTAT_BUMP(arcstat_l2_writes_error);
3890
3891         mutex_enter(&l2arc_buflist_mtx);
3892
3893         /*
3894          * All writes completed, or an error was hit.
3895          */
3896         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
3897                 ab_prev = list_prev(buflist, ab);
3898
3899                 hash_lock = HDR_LOCK(ab);
3900                 if (!mutex_tryenter(hash_lock)) {
3901                         /*
3902                          * This buffer misses out.  It may be in a stage
3903                          * of eviction.  Its ARC_L2_WRITING flag will be
3904                          * left set, denying reads to this buffer.
3905                          */
3906                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
3907                         continue;
3908                 }
3909
3910                 if (zio->io_error != 0) {
3911                         /*
3912                          * Error - drop L2ARC entry.
3913                          */
3914                         list_remove(buflist, ab);
3915                         abl2 = ab->b_l2hdr;
3916                         ab->b_l2hdr = NULL;
3917                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
3918                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
3919                 }
3920
3921                 /*
3922                  * Allow ARC to begin reads to this L2ARC entry.
3923                  */
3924                 ab->b_flags &= ~ARC_L2_WRITING;
3925
3926                 mutex_exit(hash_lock);
3927         }
3928
3929         atomic_inc_64(&l2arc_writes_done);
3930         list_remove(buflist, head);
3931         kmem_cache_free(hdr_cache, head);
3932         mutex_exit(&l2arc_buflist_mtx);
3933
3934         l2arc_do_free_on_write();
3935
3936         kmem_free(cb, sizeof (l2arc_write_callback_t));
3937 }
3938
3939 /*
3940  * A read to a cache device completed.  Validate buffer contents before
3941  * handing over to the regular ARC routines.
3942  */
3943 static void
3944 l2arc_read_done(zio_t *zio)
3945 {
3946         l2arc_read_callback_t *cb;
3947         arc_buf_hdr_t *hdr;
3948         arc_buf_t *buf;
3949         kmutex_t *hash_lock;
3950         int equal;
3951
3952         ASSERT(zio->io_vd != NULL);
3953         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
3954
3955         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
3956
3957         cb = zio->io_private;
3958         ASSERT(cb != NULL);
3959         buf = cb->l2rcb_buf;
3960         ASSERT(buf != NULL);
3961         hdr = buf->b_hdr;
3962         ASSERT(hdr != NULL);
3963
3964         hash_lock = HDR_LOCK(hdr);
3965         mutex_enter(hash_lock);
3966
3967         /*
3968          * Check this survived the L2ARC journey.
3969          */
3970         equal = arc_cksum_equal(buf);
3971         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
3972                 mutex_exit(hash_lock);
3973                 zio->io_private = buf;
3974                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
3975                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
3976                 arc_read_done(zio);
3977         } else {
3978                 mutex_exit(hash_lock);
3979                 /*
3980                  * Buffer didn't survive caching.  Increment stats and
3981                  * reissue to the original storage device.
3982                  */
3983                 if (zio->io_error != 0) {
3984                         ARCSTAT_BUMP(arcstat_l2_io_error);
3985                 } else {
3986                         zio->io_error = EIO;
3987                 }
3988                 if (!equal)
3989                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
3990
3991                 /*
3992                  * If there's no waiter, issue an async i/o to the primary
3993                  * storage now.  If there *is* a waiter, the caller must
3994                  * issue the i/o in a context where it's OK to block.
3995                  */
3996                 if (zio->io_waiter == NULL)
3997                         zio_nowait(zio_read(zio->io_parent,
3998                             cb->l2rcb_spa, &cb->l2rcb_bp,
3999                             buf->b_data, zio->io_size, arc_read_done, buf,
4000                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4001         }
4002
4003         kmem_free(cb, sizeof (l2arc_read_callback_t));
4004 }
4005
4006 /*
4007  * This is the list priority from which the L2ARC will search for pages to
4008  * cache.  This is used within loops (0..3) to cycle through lists in the
4009  * desired order.  This order can have a significant effect on cache
4010  * performance.
4011  *
4012  * Currently the metadata lists are hit first, MFU then MRU, followed by
4013  * the data lists.  This function returns a locked list, and also returns
4014  * the lock pointer.
4015  */
4016 static list_t *
4017 l2arc_list_locked(int list_num, kmutex_t **lock)
4018 {
4019         list_t *list;
4020
4021         ASSERT(list_num >= 0 && list_num <= 3);
4022
4023         switch (list_num) {
4024         case 0:
4025                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4026                 *lock = &arc_mfu->arcs_mtx;
4027                 break;
4028         case 1:
4029                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4030                 *lock = &arc_mru->arcs_mtx;
4031                 break;
4032         case 2:
4033                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4034                 *lock = &arc_mfu->arcs_mtx;
4035                 break;
4036         case 3:
4037                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4038                 *lock = &arc_mru->arcs_mtx;
4039                 break;
4040         }
4041
4042         ASSERT(!(MUTEX_HELD(*lock)));
4043         mutex_enter(*lock);
4044         return (list);
4045 }
4046
4047 /*
4048  * Evict buffers from the device write hand to the distance specified in
4049  * bytes.  This distance may span populated buffers, it may span nothing.
4050  * This is clearing a region on the L2ARC device ready for writing.
4051  * If the 'all' boolean is set, every buffer is evicted.
4052  */
4053 static void
4054 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4055 {
4056         list_t *buflist;
4057         l2arc_buf_hdr_t *abl2;
4058         arc_buf_hdr_t *ab, *ab_prev;
4059         kmutex_t *hash_lock;
4060         uint64_t taddr;
4061
4062         buflist = dev->l2ad_buflist;
4063
4064         if (buflist == NULL)
4065                 return;
4066
4067         if (!all && dev->l2ad_first) {
4068                 /*
4069                  * This is the first sweep through the device.  There is
4070                  * nothing to evict.
4071                  */
4072                 return;
4073         }
4074
4075         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4076                 /*
4077                  * When nearing the end of the device, evict to the end
4078                  * before the device write hand jumps to the start.
4079                  */
4080                 taddr = dev->l2ad_end;
4081         } else {
4082                 taddr = dev->l2ad_hand + distance;
4083         }
4084         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4085             uint64_t, taddr, boolean_t, all);
4086
4087 top:
4088         mutex_enter(&l2arc_buflist_mtx);
4089         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4090                 ab_prev = list_prev(buflist, ab);
4091
4092                 hash_lock = HDR_LOCK(ab);
4093                 if (!mutex_tryenter(hash_lock)) {
4094                         /*
4095                          * Missed the hash lock.  Retry.
4096                          */
4097                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4098                         mutex_exit(&l2arc_buflist_mtx);
4099                         mutex_enter(hash_lock);
4100                         mutex_exit(hash_lock);
4101                         goto top;
4102                 }
4103
4104                 if (HDR_L2_WRITE_HEAD(ab)) {
4105                         /*
4106                          * We hit a write head node.  Leave it for
4107                          * l2arc_write_done().
4108                          */
4109                         list_remove(buflist, ab);
4110                         mutex_exit(hash_lock);
4111                         continue;
4112                 }
4113
4114                 if (!all && ab->b_l2hdr != NULL &&
4115                     (ab->b_l2hdr->b_daddr > taddr ||
4116                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4117                         /*
4118                          * We've evicted to the target address,
4119                          * or the end of the device.
4120                          */
4121                         mutex_exit(hash_lock);
4122                         break;
4123                 }
4124
4125                 if (HDR_FREE_IN_PROGRESS(ab)) {
4126                         /*
4127                          * Already on the path to destruction.
4128                          */
4129                         mutex_exit(hash_lock);
4130                         continue;
4131                 }
4132
4133                 if (ab->b_state == arc_l2c_only) {
4134                         ASSERT(!HDR_L2_READING(ab));
4135                         /*
4136                          * This doesn't exist in the ARC.  Destroy.
4137                          * arc_hdr_destroy() will call list_remove()
4138                          * and decrement arcstat_l2_size.
4139                          */
4140                         arc_change_state(arc_anon, ab, hash_lock);
4141                         arc_hdr_destroy(ab);
4142                 } else {
4143                         /*
4144                          * Invalidate issued or about to be issued
4145                          * reads, since we may be about to write
4146                          * over this location.
4147                          */
4148                         if (HDR_L2_READING(ab)) {
4149                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4150                                 ab->b_flags |= ARC_L2_EVICTED;
4151                         }
4152
4153                         /*
4154                          * Tell ARC this no longer exists in L2ARC.
4155                          */
4156                         if (ab->b_l2hdr != NULL) {
4157                                 abl2 = ab->b_l2hdr;
4158                                 ab->b_l2hdr = NULL;
4159                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4160                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4161                         }
4162                         list_remove(buflist, ab);
4163
4164                         /*
4165                          * This may have been leftover after a
4166                          * failed write.
4167                          */
4168                         ab->b_flags &= ~ARC_L2_WRITING;
4169                 }
4170                 mutex_exit(hash_lock);
4171         }
4172         mutex_exit(&l2arc_buflist_mtx);
4173
4174         spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
4175         dev->l2ad_evict = taddr;
4176 }
4177
4178 /*
4179  * Find and write ARC buffers to the L2ARC device.
4180  *
4181  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4182  * for reading until they have completed writing.
4183  */
4184 static void
4185 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4186 {
4187         arc_buf_hdr_t *ab, *ab_prev, *head;
4188         l2arc_buf_hdr_t *hdrl2;
4189         list_t *list;
4190         uint64_t passed_sz, write_sz, buf_sz, headroom;
4191         void *buf_data;
4192         kmutex_t *hash_lock, *list_lock;
4193         boolean_t have_lock, full;
4194         l2arc_write_callback_t *cb;
4195         zio_t *pio, *wzio;
4196         int try;
4197
4198         ASSERT(dev->l2ad_vdev != NULL);
4199
4200         pio = NULL;
4201         write_sz = 0;
4202         full = B_FALSE;
4203         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4204         head->b_flags |= ARC_L2_WRITE_HEAD;
4205
4206         /*
4207          * Copy buffers for L2ARC writing.
4208          */
4209         mutex_enter(&l2arc_buflist_mtx);
4210         for (try = 0; try <= 3; try++) {
4211                 list = l2arc_list_locked(try, &list_lock);
4212                 passed_sz = 0;
4213
4214                 /*
4215                  * L2ARC fast warmup.
4216                  *
4217                  * Until the ARC is warm and starts to evict, read from the
4218                  * head of the ARC lists rather than the tail.
4219                  */
4220                 headroom = target_sz * l2arc_headroom;
4221                 if (arc_warm == B_FALSE)
4222                         ab = list_head(list);
4223                 else
4224                         ab = list_tail(list);
4225
4226                 for (; ab; ab = ab_prev) {
4227                         if (arc_warm == B_FALSE)
4228                                 ab_prev = list_next(list, ab);
4229                         else
4230                                 ab_prev = list_prev(list, ab);
4231
4232                         hash_lock = HDR_LOCK(ab);
4233                         have_lock = MUTEX_HELD(hash_lock);
4234                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4235                                 /*
4236                                  * Skip this buffer rather than waiting.
4237                                  */
4238                                 continue;
4239                         }
4240
4241                         passed_sz += ab->b_size;
4242                         if (passed_sz > headroom) {
4243                                 /*
4244                                  * Searched too far.
4245                                  */
4246                                 mutex_exit(hash_lock);
4247                                 break;
4248                         }
4249
4250                         if (ab->b_spa != spa) {
4251                                 mutex_exit(hash_lock);
4252                                 continue;
4253                         }
4254
4255                         if (ab->b_l2hdr != NULL) {
4256                                 /*
4257                                  * Already in L2ARC.
4258                                  */
4259                                 mutex_exit(hash_lock);
4260                                 continue;
4261                         }
4262
4263                         if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
4264                                 mutex_exit(hash_lock);
4265                                 continue;
4266                         }
4267
4268                         if ((write_sz + ab->b_size) > target_sz) {
4269                                 full = B_TRUE;
4270                                 mutex_exit(hash_lock);
4271                                 break;
4272                         }
4273
4274                         if (ab->b_buf == NULL) {
4275                                 DTRACE_PROBE1(l2arc__buf__null, void *, ab);
4276                                 mutex_exit(hash_lock);
4277                                 continue;
4278                         }
4279
4280                         if (pio == NULL) {
4281                                 /*
4282                                  * Insert a dummy header on the buflist so
4283                                  * l2arc_write_done() can find where the
4284                                  * write buffers begin without searching.
4285                                  */
4286                                 list_insert_head(dev->l2ad_buflist, head);
4287
4288                                 cb = kmem_alloc(
4289                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4290                                 cb->l2wcb_dev = dev;
4291                                 cb->l2wcb_head = head;
4292                                 pio = zio_root(spa, l2arc_write_done, cb,
4293                                     ZIO_FLAG_CANFAIL);
4294                         }
4295
4296                         /*
4297                          * Create and add a new L2ARC header.
4298                          */
4299                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4300                         hdrl2->b_dev = dev;
4301                         hdrl2->b_daddr = dev->l2ad_hand;
4302
4303                         ab->b_flags |= ARC_L2_WRITING;
4304                         ab->b_l2hdr = hdrl2;
4305                         list_insert_head(dev->l2ad_buflist, ab);
4306                         buf_data = ab->b_buf->b_data;
4307                         buf_sz = ab->b_size;
4308
4309                         /*
4310                          * Compute and store the buffer cksum before
4311                          * writing.  On debug the cksum is verified first.
4312                          */
4313                         arc_cksum_verify(ab->b_buf);
4314                         arc_cksum_compute(ab->b_buf, B_TRUE);
4315
4316                         mutex_exit(hash_lock);
4317
4318                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4319                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4320                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4321                             ZIO_FLAG_CANFAIL, B_FALSE);
4322
4323                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4324                             zio_t *, wzio);
4325                         (void) zio_nowait(wzio);
4326
4327                         /*
4328                          * Keep the clock hand suitably device-aligned.
4329                          */
4330                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4331
4332                         write_sz += buf_sz;
4333                         dev->l2ad_hand += buf_sz;
4334                 }
4335
4336                 mutex_exit(list_lock);
4337
4338                 if (full == B_TRUE)
4339                         break;
4340         }
4341         mutex_exit(&l2arc_buflist_mtx);
4342
4343         if (pio == NULL) {
4344                 ASSERT3U(write_sz, ==, 0);
4345                 kmem_cache_free(hdr_cache, head);
4346                 return;
4347         }
4348
4349         ASSERT3U(write_sz, <=, target_sz);
4350         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4351         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4352         spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
4353
4354         /*
4355          * Bump device hand to the device start if it is approaching the end.
4356          * l2arc_evict() will already have evicted ahead for this case.
4357          */
4358         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4359                 spa_l2cache_space_update(dev->l2ad_vdev, 0,
4360                     dev->l2ad_end - dev->l2ad_hand);
4361                 dev->l2ad_hand = dev->l2ad_start;
4362                 dev->l2ad_evict = dev->l2ad_start;
4363                 dev->l2ad_first = B_FALSE;
4364         }
4365
4366         (void) zio_wait(pio);
4367 }
4368
4369 /*
4370  * This thread feeds the L2ARC at regular intervals.  This is the beating
4371  * heart of the L2ARC.
4372  */
4373 static void
4374 l2arc_feed_thread(void *dummy __unused)
4375 {
4376         callb_cpr_t cpr;
4377         l2arc_dev_t *dev;
4378         spa_t *spa;
4379         uint64_t size;
4380
4381         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4382
4383         mutex_enter(&l2arc_feed_thr_lock);
4384
4385         while (l2arc_thread_exit == 0) {
4386                 /*
4387                  * Pause for l2arc_feed_secs seconds between writes.
4388                  */
4389                 CALLB_CPR_SAFE_BEGIN(&cpr);
4390                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4391                     hz * l2arc_feed_secs);
4392                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4393
4394                 /*
4395                  * Quick check for L2ARC devices.
4396                  */
4397                 mutex_enter(&l2arc_dev_mtx);
4398                 if (l2arc_ndev == 0) {
4399                         mutex_exit(&l2arc_dev_mtx);
4400                         continue;
4401                 }
4402                 mutex_exit(&l2arc_dev_mtx);
4403
4404                 /*
4405                  * This selects the next l2arc device to write to, and in
4406                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4407                  * will return NULL if there are now no l2arc devices or if
4408                  * they are all faulted.
4409                  *
4410                  * If a device is returned, its spa's config lock is also
4411                  * held to prevent device removal.  l2arc_dev_get_next()
4412                  * will grab and release l2arc_dev_mtx.
4413                  */
4414                 if ((dev = l2arc_dev_get_next()) == NULL)
4415                         continue;
4416
4417                 spa = dev->l2ad_spa;
4418                 ASSERT(spa != NULL);
4419
4420                 /*
4421                  * Avoid contributing to memory pressure.
4422                  */
4423                 if (arc_reclaim_needed()) {
4424                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4425                         spa_config_exit(spa, SCL_L2ARC, dev);
4426                         continue;
4427                 }
4428
4429                 ARCSTAT_BUMP(arcstat_l2_feeds);
4430
4431                 size = dev->l2ad_write;
4432                 if (arc_warm == B_FALSE)
4433                         size += dev->l2ad_boost;
4434
4435                 /*
4436                  * Evict L2ARC buffers that will be overwritten.
4437                  */
4438                 l2arc_evict(dev, size, B_FALSE);
4439
4440                 /*
4441                  * Write ARC buffers.
4442                  */
4443                 l2arc_write_buffers(spa, dev, size);
4444                 spa_config_exit(spa, SCL_L2ARC, dev);
4445         }
4446
4447         l2arc_thread_exit = 0;
4448         cv_broadcast(&l2arc_feed_thr_cv);
4449         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
4450         thread_exit();
4451 }
4452
4453 boolean_t
4454 l2arc_vdev_present(vdev_t *vd)
4455 {
4456         l2arc_dev_t *dev;
4457
4458         mutex_enter(&l2arc_dev_mtx);
4459         for (dev = list_head(l2arc_dev_list); dev != NULL;
4460             dev = list_next(l2arc_dev_list, dev)) {
4461                 if (dev->l2ad_vdev == vd)
4462                         break;
4463         }
4464         mutex_exit(&l2arc_dev_mtx);
4465
4466         return (dev != NULL);
4467 }
4468
4469 /*
4470  * Add a vdev for use by the L2ARC.  By this point the spa has already
4471  * validated the vdev and opened it.
4472  */
4473 void
4474 l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
4475 {
4476         l2arc_dev_t *adddev;
4477
4478         ASSERT(!l2arc_vdev_present(vd));
4479
4480         /*
4481          * Create a new l2arc device entry.
4482          */
4483         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4484         adddev->l2ad_spa = spa;
4485         adddev->l2ad_vdev = vd;
4486         adddev->l2ad_write = l2arc_write_max;
4487         adddev->l2ad_boost = l2arc_write_boost;
4488         adddev->l2ad_start = start;
4489         adddev->l2ad_end = end;
4490         adddev->l2ad_hand = adddev->l2ad_start;
4491         adddev->l2ad_evict = adddev->l2ad_start;
4492         adddev->l2ad_first = B_TRUE;
4493         ASSERT3U(adddev->l2ad_write, >, 0);
4494
4495         /*
4496          * This is a list of all ARC buffers that are still valid on the
4497          * device.
4498          */
4499         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4500         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4501             offsetof(arc_buf_hdr_t, b_l2node));
4502
4503         spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
4504
4505         /*
4506          * Add device to global list
4507          */
4508         mutex_enter(&l2arc_dev_mtx);
4509         list_insert_head(l2arc_dev_list, adddev);
4510         atomic_inc_64(&l2arc_ndev);
4511         mutex_exit(&l2arc_dev_mtx);
4512 }
4513
4514 /*
4515  * Remove a vdev from the L2ARC.
4516  */
4517 void
4518 l2arc_remove_vdev(vdev_t *vd)
4519 {
4520         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4521
4522         /*
4523          * Find the device by vdev
4524          */
4525         mutex_enter(&l2arc_dev_mtx);
4526         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4527                 nextdev = list_next(l2arc_dev_list, dev);
4528                 if (vd == dev->l2ad_vdev) {
4529                         remdev = dev;
4530                         break;
4531                 }
4532         }
4533         ASSERT(remdev != NULL);
4534
4535         /*
4536          * Remove device from global list
4537          */
4538         list_remove(l2arc_dev_list, remdev);
4539         l2arc_dev_last = NULL;          /* may have been invalidated */
4540         atomic_dec_64(&l2arc_ndev);
4541         mutex_exit(&l2arc_dev_mtx);
4542
4543         /*
4544          * Clear all buflists and ARC references.  L2ARC device flush.
4545          */
4546         l2arc_evict(remdev, 0, B_TRUE);
4547         list_destroy(remdev->l2ad_buflist);
4548         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4549         kmem_free(remdev, sizeof (l2arc_dev_t));
4550 }
4551
4552 void
4553 l2arc_init(void)
4554 {
4555         l2arc_thread_exit = 0;
4556         l2arc_ndev = 0;
4557         l2arc_writes_sent = 0;
4558         l2arc_writes_done = 0;
4559
4560         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4561         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4562         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4563         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4564         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4565
4566         l2arc_dev_list = &L2ARC_dev_list;
4567         l2arc_free_on_write = &L2ARC_free_on_write;
4568         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4569             offsetof(l2arc_dev_t, l2ad_node));
4570         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4571             offsetof(l2arc_data_free_t, l2df_list_node));
4572 }
4573
4574 void
4575 l2arc_fini(void)
4576 {
4577         /*
4578          * This is called from dmu_fini(), which is called from spa_fini();
4579          * Because of this, we can assume that all l2arc devices have
4580          * already been removed when the pools themselves were removed.
4581          */
4582
4583         l2arc_do_free_on_write();
4584
4585         mutex_destroy(&l2arc_feed_thr_lock);
4586         cv_destroy(&l2arc_feed_thr_cv);
4587         mutex_destroy(&l2arc_dev_mtx);
4588         mutex_destroy(&l2arc_buflist_mtx);
4589         mutex_destroy(&l2arc_free_on_write_mtx);
4590
4591         list_destroy(l2arc_dev_list);
4592         list_destroy(l2arc_free_on_write);
4593 }
4594
4595 void
4596 l2arc_start(void)
4597 {
4598         if (!(spa_mode & FWRITE))
4599                 return;
4600
4601         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4602             TS_RUN, minclsyspri);
4603 }
4604
4605 void
4606 l2arc_stop(void)
4607 {
4608         if (!(spa_mode & FWRITE))
4609                 return;
4610
4611         mutex_enter(&l2arc_feed_thr_lock);
4612         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
4613         l2arc_thread_exit = 1;
4614         while (l2arc_thread_exit != 0)
4615                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4616         mutex_exit(&l2arc_feed_thr_lock);
4617 }