sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26
  27 /*
  28  * DVA-based Adjustable Replacement Cache
  29  *
  30  * While much of the theory of operation used here is
  31  * based on the self-tuning, low overhead replacement cache
  32  * presented by Megiddo and Modha at FAST 2003, there are some
  33  * significant differences:
  34  *
  35  * 1. The Megiddo and Modha model assumes any page is evictable.
  36  * Pages in its cache cannot be "locked" into memory.  This makes
  37  * the eviction algorithm simple: evict the last page in the list.
  38  * This also makes the performance characteristics easy to reason
  39  * about.  Our cache is not so simple.  At any given moment, some
  40  * subset of the blocks in the cache are un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache growing unbounded at these times we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefore exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, it's simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (ranging from 512 bytes to
  64  * 128K bytes).  We therefore choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
  71
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefore provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock you
  97  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
  99  *
 100  * Arc buffers may have an associated eviction callback function.
 101  * This function will be invoked prior to removing the buffer (e.g.
 102  * in arc_do_user_evicts()).  Note however that the data associated
 103  * with the buffer may be evicted prior to the callback.  The callback
 104  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  * the users of callbacks must ensure that their private data is
 106  * protected from simultaneous callbacks from arc_buf_evict()
 107  * and arc_do_user_evicts().
 108  *
 109  * Note that the majority of the performance stats are manipulated
 110  * with atomic operations.
 111  *
 112  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  *
 114  *      - L2ARC buflist creation
 115  *      - L2ARC buflist eviction
 116  *      - L2ARC write completion, which walks L2ARC buflists
 117  *      - ARC header destruction, as it removes from L2ARC buflists
 118  *      - ARC header release, as it removes from L2ARC buflists
 119  */
 120
 121 #include <sys/spa.h>
 122 #include <sys/zio.h>
 123 #include <sys/zfs_context.h>
 124 #include <sys/arc.h>
 125 #include <sys/refcount.h>
 126 #include <sys/vdev.h>
 127 #include <sys/vdev_impl.h>
 128 #ifdef _KERNEL
 129 #include <sys/dnlc.h>
 130 #endif
 131 #include <sys/callb.h>
 132 #include <sys/kstat.h>
 133 #include <sys/trim_map.h>
 134 #include <zfs_fletcher.h>
 135 #include <sys/sdt.h>
 136
 137 #include <vm/vm_pageout.h>
 138
 139 #ifdef illumos
 140 #ifndef _KERNEL        /* userland builds only */
 141 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 142 boolean_t arc_watch = B_FALSE;
 143 int arc_procfd;        /* NOTE(review): presumably the /proc fd used to arm the watchpoints -- confirm */
 144 #endif
 145 #endif /* illumos */
 146
 147 static kmutex_t         arc_reclaim_thr_lock;   /* NOTE(review): presumably pairs with the cv below -- confirm */
 148 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 149 static uint8_t          arc_thread_exit;        /* NOTE(review): presumably asks ARC threads to exit -- confirm */
 150
 151 extern int zfs_write_limit_shift;       /* zfs_write_limit_*: declared here, defined elsewhere */
 152 extern uint64_t zfs_write_limit_max;
 153 extern kmutex_t zfs_write_limit_lock;
 154
 155 #define ARC_REDUCE_DNLC_PERCENT 3       /* initial value of arc_reduce_dnlc_percent */
 156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 157
 158 typedef enum arc_reclaim_strategy {     /* the two reclaim strategies */
 159         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 160         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 161 } arc_reclaim_strategy_t;
 162
 163 /* number of seconds before growing cache again (default: 60) */
 164 static int              arc_grow_retry = 60;
 165
 166 /* shift of arc_c for calculating both min and max arc_p (default: 4) */
 167 static int              arc_p_min_shift = 4;
 168
 169 /* log2(fraction of arc to reclaim); the default of 5 means 1/32 */
 170 static int              arc_shrink_shift = 5;
 171
 172 /*
 173  * minimum lifespan of a prefetch block in clock ticks
 174  * (initialized in arc_init())
 175  */
 176 static int              arc_min_prefetch_lifespan;
 177
 178 static int arc_dead;    /* NOTE(review): presumably set once the ARC is shut down -- confirm */
 179 extern int zfs_prefetch_disable;        /* declared here, defined elsewhere */
 180
 181 /*
 182  * The arc has filled available memory and has now warmed up.
 183  */
 184 static boolean_t arc_warm;
 185
 186 /*
 187  * These tunables are for performance analysis.
 188  */
 189 uint64_t zfs_arc_max;                   /* loader tunable vfs.zfs.arc_max */
 190 uint64_t zfs_arc_min;                   /* loader tunable vfs.zfs.arc_min */
 191 uint64_t zfs_arc_meta_limit = 0;        /* loader tunable vfs.zfs.arc_meta_limit */
 192 int zfs_arc_grow_retry = 0;     /* NOTE(review): 0 presumably keeps the arc_grow_retry default -- confirm */
 193 int zfs_arc_shrink_shift = 0;   /* NOTE(review): likewise for arc_shrink_shift -- confirm */
 194 int zfs_arc_p_min_shift = 0;    /* NOTE(review): likewise for arc_p_min_shift -- confirm */
 195 int zfs_disable_dup_eviction = 0;
 196
 197 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 198 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 199 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 200 SYSCTL_DECL(_vfs_zfs);
 201 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
 202     "Maximum ARC size");    /* read-only view of zfs_arc_max */
 203 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
 204     "Minimum ARC size");    /* read-only view of zfs_arc_min */
 205
 206 /*
 207  * Note that buffers can be in one of 6 states:
 208  *      ARC_anon        - anonymous (discussed below)
 209  *      ARC_mru         - recently used, currently cached
 210  *      ARC_mru_ghost   - recently used, no longer in cache
 211  *      ARC_mfu         - frequently used, currently cached
 212  *      ARC_mfu_ghost   - frequently used, no longer in cache
 213  *      ARC_l2c_only    - exists in L2ARC but not other states
 214  * When there are no active references to a buffer, it is
 215  * linked onto a list in one of these arc states.  These are
 216  * the only buffers that can be evicted or deleted.  Within each
 217  * state there are multiple lists, one for meta-data and one for
 218  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 219  * etc.) is tracked separately so that it can be managed more
 220  * explicitly: favored over data, limited explicitly.
 221  *
 222  * Anonymous buffers are buffers that are not associated with
 223  * a DVA.  These are buffers that hold dirty block copies
 224  * before they are written to stable storage.  By definition,
 225  * they are "ref'd" and are considered part of arc_mru
 226  * that cannot be freed.  Generally, they will acquire a DVA
 227  * as they are written and migrate onto the arc_mru list.
 228  *
 229  * The ARC_l2c_only state is for buffers that are in the second
 230  * level ARC but no longer in any of the ARC_m* lists.  The second
 231  * level ARC itself may also contain buffers that are in any of
 232  * the ARC_m* states - meaning that a buffer can exist in two
 233  * places.  The reason for the ARC_l2c_only state is to keep the
 234  * buffer header in the hash table, so that reads that hit the
 235  * second level ARC benefit from these fast lookups.
 236  */
 237
 238 #define ARCS_LOCK_PAD           CACHE_LINE_SIZE /* padded size of one struct arcs_lock */
 239 struct arcs_lock {      /* a list mutex, padded in kernel builds */
 240         kmutex_t        arcs_lock;
 241 #ifdef _KERNEL
 242         unsigned char   pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];       /* fill up to ARCS_LOCK_PAD bytes */
 243 #endif
 244 };
 245
 246 /*
 247  * Number of evictable-buffer lists kept per state, for data and for
 248  * meta-data.  Must be a power of two for mask use to work.
 249  */
 250 #define ARC_BUFC_NUMDATALISTS           16
 251 #define ARC_BUFC_NUMMETADATALISTS       16
 252 #define ARC_BUFC_NUMLISTS       (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
 253
 254 typedef struct arc_state {
 255         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data, per buffer type */
 256         uint64_t arcs_size;     /* total amount of data in this state */
 257         list_t  arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
 258         struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); /* same count as arcs_lists */
 259 } arc_state_t;
 260
 261 #define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))     /* address of mutex i of state s */
 262
 263 /* The 6 states (storage for the arc_state_t objects themselves): */
 264 static arc_state_t ARC_anon;
 265 static arc_state_t ARC_mru;
 266 static arc_state_t ARC_mru_ghost;
 267 static arc_state_t ARC_mfu;
 268 static arc_state_t ARC_mfu_ghost;
 269 static arc_state_t ARC_l2c_only;
 270
 271 typedef struct arc_stats {      /* one named kstat per counter; order must match the arc_stats initializer */
 272         kstat_named_t arcstat_hits;
 273         kstat_named_t arcstat_misses;
 274         kstat_named_t arcstat_demand_data_hits;
 275         kstat_named_t arcstat_demand_data_misses;
 276         kstat_named_t arcstat_demand_metadata_hits;
 277         kstat_named_t arcstat_demand_metadata_misses;
 278         kstat_named_t arcstat_prefetch_data_hits;
 279         kstat_named_t arcstat_prefetch_data_misses;
 280         kstat_named_t arcstat_prefetch_metadata_hits;
 281         kstat_named_t arcstat_prefetch_metadata_misses;
 282         kstat_named_t arcstat_mru_hits;
 283         kstat_named_t arcstat_mru_ghost_hits;
 284         kstat_named_t arcstat_mfu_hits;
 285         kstat_named_t arcstat_mfu_ghost_hits;
 286         kstat_named_t arcstat_allocated;
 287         kstat_named_t arcstat_deleted;
 288         kstat_named_t arcstat_stolen;
 289         kstat_named_t arcstat_recycle_miss;
 290         /*
 291          * Number of buffers that could not be evicted because the hash lock
 292          * was held by another thread.  The lock may not necessarily be held
 293          * by something using the same buffer, since hash locks are shared
 294          * by multiple buffers.
 295          */
 296         kstat_named_t arcstat_mutex_miss;
 297         /*
 298          * Number of buffers skipped because they have I/O in progress, are
 299          * indirect prefetch buffers that have not lived long enough, or are
 300          * not from the spa we're trying to evict from.
 301          */
 302         kstat_named_t arcstat_evict_skip;
 303         kstat_named_t arcstat_evict_l2_cached;
 304         kstat_named_t arcstat_evict_l2_eligible;
 305         kstat_named_t arcstat_evict_l2_ineligible;
 306         kstat_named_t arcstat_hash_elements;
 307         kstat_named_t arcstat_hash_elements_max;
 308         kstat_named_t arcstat_hash_collisions;
 309         kstat_named_t arcstat_hash_chains;
 310         kstat_named_t arcstat_hash_chain_max;
 311         kstat_named_t arcstat_p;
 312         kstat_named_t arcstat_c;
 313         kstat_named_t arcstat_c_min;
 314         kstat_named_t arcstat_c_max;
 315         kstat_named_t arcstat_size;
 316         kstat_named_t arcstat_hdr_size;
 317         kstat_named_t arcstat_data_size;
 318         kstat_named_t arcstat_other_size;
 319         kstat_named_t arcstat_l2_hits;
 320         kstat_named_t arcstat_l2_misses;
 321         kstat_named_t arcstat_l2_feeds;
 322         kstat_named_t arcstat_l2_rw_clash;
 323         kstat_named_t arcstat_l2_read_bytes;
 324         kstat_named_t arcstat_l2_write_bytes;
 325         kstat_named_t arcstat_l2_writes_sent;
 326         kstat_named_t arcstat_l2_writes_done;
 327         kstat_named_t arcstat_l2_writes_error;
 328         kstat_named_t arcstat_l2_writes_hdr_miss;
 329         kstat_named_t arcstat_l2_evict_lock_retry;
 330         kstat_named_t arcstat_l2_evict_reading;
 331         kstat_named_t arcstat_l2_free_on_write;
 332         kstat_named_t arcstat_l2_abort_lowmem;
 333         kstat_named_t arcstat_l2_cksum_bad;
 334         kstat_named_t arcstat_l2_io_error;
 335         kstat_named_t arcstat_l2_size;
 336         kstat_named_t arcstat_l2_hdr_size;
 337         kstat_named_t arcstat_l2_write_trylock_fail;
 338         kstat_named_t arcstat_l2_write_passed_headroom;
 339         kstat_named_t arcstat_l2_write_spa_mismatch;
 340         kstat_named_t arcstat_l2_write_in_l2;
 341         kstat_named_t arcstat_l2_write_hdr_io_in_progress;
 342         kstat_named_t arcstat_l2_write_not_cacheable;
 343         kstat_named_t arcstat_l2_write_full;
 344         kstat_named_t arcstat_l2_write_buffer_iter;
 345         kstat_named_t arcstat_l2_write_pios;
 346         kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
 347         kstat_named_t arcstat_l2_write_buffer_list_iter;
 348         kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 349         kstat_named_t arcstat_memory_throttle_count;
 350         kstat_named_t arcstat_duplicate_buffers;
 351         kstat_named_t arcstat_duplicate_buffers_size;
 352         kstat_named_t arcstat_duplicate_reads;
 353 } arc_stats_t;
 354
 355 static arc_stats_t arc_stats = {        /* positional: entries follow the arc_stats_t field order */
 356         { "hits",                       KSTAT_DATA_UINT64 },
 357         { "misses",                     KSTAT_DATA_UINT64 },
 358         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 359         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 360         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 361         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 362         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 363         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 364         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 365         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 366         { "mru_hits",                   KSTAT_DATA_UINT64 },
 367         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 368         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 369         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 370         { "allocated",                  KSTAT_DATA_UINT64 },
 371         { "deleted",                    KSTAT_DATA_UINT64 },
 372         { "stolen",                     KSTAT_DATA_UINT64 },
 373         { "recycle_miss",               KSTAT_DATA_UINT64 },
 374         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 375         { "evict_skip",                 KSTAT_DATA_UINT64 },
 376         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 377         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 378         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 379         { "hash_elements",              KSTAT_DATA_UINT64 },
 380         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 381         { "hash_collisions",            KSTAT_DATA_UINT64 },
 382         { "hash_chains",                KSTAT_DATA_UINT64 },
 383         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 384         { "p",                          KSTAT_DATA_UINT64 },
 385         { "c",                          KSTAT_DATA_UINT64 },
 386         { "c_min",                      KSTAT_DATA_UINT64 },
 387         { "c_max",                      KSTAT_DATA_UINT64 },
 388         { "size",                       KSTAT_DATA_UINT64 },
 389         { "hdr_size",                   KSTAT_DATA_UINT64 },
 390         { "data_size",                  KSTAT_DATA_UINT64 },
 391         { "other_size",                 KSTAT_DATA_UINT64 },
 392         { "l2_hits",                    KSTAT_DATA_UINT64 },
 393         { "l2_misses",                  KSTAT_DATA_UINT64 },
 394         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 395         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 396         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 397         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 398         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 399         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 400         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 401         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 402         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 403         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 404         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 405         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 406         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 407         { "l2_io_error",                KSTAT_DATA_UINT64 },
 408         { "l2_size",                    KSTAT_DATA_UINT64 },
 409         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 410         { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
 411         { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
 412         { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
 413         { "l2_write_in_l2",             KSTAT_DATA_UINT64 },
 414         { "l2_write_io_in_progress",    KSTAT_DATA_UINT64 },    /* field: arcstat_l2_write_hdr_io_in_progress */
 415         { "l2_write_not_cacheable",     KSTAT_DATA_UINT64 },
 416         { "l2_write_full",              KSTAT_DATA_UINT64 },
 417         { "l2_write_buffer_iter",       KSTAT_DATA_UINT64 },
 418         { "l2_write_pios",              KSTAT_DATA_UINT64 },
 419         { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
 420         { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
 421         { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
 422         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 423         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 424         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 425         { "duplicate_reads",            KSTAT_DATA_UINT64 }
 426 };
 427
 428 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 429
 430 #define ARCSTAT_INCR(stat, val) \
 431         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 432
 433 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 434 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 435
 436 #define ARCSTAT_MAX(stat, val) {                                        \
 437         uint64_t m;                                                     \
 438         while ((val) > (m = arc_stats.stat.value.ui64) &&               \
 439             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
 440                 continue;                                               \
 441 }
 442
 443 #define ARCSTAT_MAXSTAT(stat) \
 444         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 445
 446 /*
 447  * We define a macro to allow ARC hits/misses to be easily broken down by
 448  * two separate conditions, giving a total of four different subtypes for
 449  * each of hits and misses (so eight statistics total).
 450  */
 451 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 452         if (cond1) {                                                    \
 453                 if (cond2) {                                            \
 454                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 455                 } else {                                                \
 456                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 457                 }                                                       \
 458         } else {                                                        \
 459                 if (cond2) {                                            \
 460                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 461                 } else {                                                \
 462                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 463                 }                                                       \
 464         }
 465
 466 kstat_t                 *arc_ksp;       /* NOTE(review): presumably the kstat publishing arc_stats -- confirm */
 467 static arc_state_t      *arc_anon;      /* NOTE(review): these six presumably point at the ARC_* objects -- confirm */
 468 static arc_state_t      *arc_mru;
 469 static arc_state_t      *arc_mru_ghost;
 470 static arc_state_t      *arc_mfu;
 471 static arc_state_t      *arc_mfu_ghost;
 472 static arc_state_t      *arc_l2c_only;
 473
 474 /*
 475  * There are several ARC variables that are critical to export as kstats --
 476  * but we don't want to have to grovel around in the kstat whenever we wish to
 477  * manipulate them.  For these variables, we therefore define them to be in
 478  * terms of the statistic variable.  This assures that we are not introducing
 479  * the possibility of inconsistency by having shadow copies of the variables,
 480  * while still allowing the code to be readable.
 481  */
 482 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 483 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 484 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 485 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 486 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 487
 488 static int              arc_no_grow;    /* Don't try to grow cache size */
 489 static uint64_t         arc_tempreserve;
 490 static uint64_t         arc_loaned_bytes;
 491 static uint64_t         arc_meta_used;  /* exported read-only as vfs.zfs.arc_meta_used */
 492 static uint64_t         arc_meta_limit; /* exported as vfs.zfs.arc_meta_limit */
 493 static uint64_t         arc_meta_max = 0;
 494 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD,      /* a usage counter, not a tunable */
 495     &arc_meta_used, 0, "ARC metadata used");
 496 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
 497     &arc_meta_limit, 0, "ARC metadata limit");
 498
 499 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;   /* forward declaration */
 500
 501 typedef struct arc_callback arc_callback_t;
 502
 503 struct arc_callback {   /* NOTE(review): presumably one pending read-completion callback -- confirm */
 504         void                    *acb_private;
 505         arc_done_func_t         *acb_done;
 506         arc_buf_t               *acb_buf;
 507         zio_t                   *acb_zio_dummy;
 508         arc_callback_t          *acb_next;      /* next arc_callback_t in the chain */
 509 };
 510
 511 typedef struct arc_write_callback arc_write_callback_t;
 512
 513 struct arc_write_callback {     /* NOTE(review): presumably the write-side counterpart (ready + done) -- confirm */
 514         void            *awcb_private;
 515         arc_done_func_t *awcb_ready;
 516         arc_done_func_t *awcb_done;
 517         arc_buf_t       *awcb_buf;
 518 };
 519
 520 struct arc_buf_hdr {    /* ARC buffer header; fields are grouped by what protects them */
 521         /* protected by hash lock */
 522         dva_t                   b_dva;
 523         uint64_t                b_birth;
 524         uint64_t                b_cksum0;
 525
 526         kmutex_t                b_freeze_lock;
 527         zio_cksum_t             *b_freeze_cksum;
 528         void                    *b_thawed;
 529
 530         arc_buf_hdr_t           *b_hash_next;   /* NOTE(review): presumably the hash-chain link -- confirm */
 531         arc_buf_t               *b_buf;
 532         uint32_t                b_flags;        /* ARC_* flag bits, tested by the HDR_*() macros */
 533         uint32_t                b_datacnt;
 534
 535         arc_callback_t          *b_acb;
 536         kcondvar_t              b_cv;
 537
 538         /* immutable */
 539         arc_buf_contents_t      b_type;
 540         uint64_t                b_size;
 541         uint64_t                b_spa;
 542
 543         /* protected by arc state mutex */
 544         arc_state_t             *b_state;
 545         list_node_t             b_arc_node;
 546
 547         /* updated atomically */
 548         clock_t                 b_arc_access;
 549
 550         /* self protecting */
 551         refcount_t              b_refcnt;
 552
 553         l2arc_buf_hdr_t         *b_l2hdr;       /* HDR_L2_READING() requires this to be non-NULL */
 554         list_node_t             b_l2node;
 555 };
 556
 557 static arc_buf_t *arc_eviction_list;
 558 static kmutex_t arc_eviction_mtx;       /* NOTE(review): presumably guards arc_eviction_list -- confirm */
 559 static arc_buf_hdr_t arc_eviction_hdr;
 560 static void arc_get_data_buf(arc_buf_t *buf);   /* prototypes for static functions follow */
 561 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 562 static int arc_evict_needed(arc_buf_contents_t type);
 563 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 564 #ifdef illumos
 565 static void arc_buf_watch(arc_buf_t *buf);      /* declared on illumos builds only */
 566 #endif /* illumos */
 567
 568 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 569
 570 #define GHOST_STATE(state)      /* true for arc_mru_ghost, arc_mfu_ghost and arc_l2c_only */ \
 571         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 572         (state) == arc_l2c_only)
 573
 574 /*
 575  * Private ARC flags.  These flags are private ARC only flags that will show up
 576  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 577  * be passed in as arc_flags in things like arc_read.  However, these flags
 578  * should never be passed and should only be set by ARC code.  When adding new
 579  * public flags, make sure not to smash the private ones.
 580  */
 581
 582 #define ARC_IN_HASH_TABLE       (1 << 9)        /* this buffer is hashed */
 583 #define ARC_IO_IN_PROGRESS      (1 << 10)       /* I/O in progress for buf */
 584 #define ARC_IO_ERROR            (1 << 11)       /* I/O failed for buf */
 585 #define ARC_FREED_IN_READ       (1 << 12)       /* buf freed while in read */
 586 #define ARC_BUF_AVAILABLE       (1 << 13)       /* block not in active use */
 587 #define ARC_INDIRECT            (1 << 14)       /* this is an indirect block */
 588 #define ARC_FREE_IN_PROGRESS    (1 << 15)       /* hdr about to be freed */
 589 #define ARC_L2_WRITING          (1 << 16)       /* L2ARC write in progress */
 590 #define ARC_L2_EVICTED          (1 << 17)       /* evicted during I/O */
 591 #define ARC_L2_WRITE_HEAD       (1 << 18)       /* head of write list */
 592
 593 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)    /* HDR_*: masked bit, non-zero when set */
 594 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 595 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 596 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH) /* ARC_PREFETCH is not defined in this section */
 597 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 598 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 599 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 600 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)  /* ARC_L2CACHE is not defined in this section */
 601 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS && /* '&' binds before '&&' */ \
 602                                     (hdr)->b_l2hdr != NULL)
 603 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 604 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 605 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 606
 607 /*
 608  * Header sizes, expressed as signed 64-bit values
 609  */
 610
 611 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))      /* bytes per ARC header */
 612 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))  /* bytes per L2ARC header */
 613
 614 /*
 615  * Hash table routines
 616  */
 617
 618 #define HT_LOCK_PAD     CACHE_LINE_SIZE /* padded size of one struct ht_lock */
 619
 620 struct ht_lock {        /* a hash mutex, padded in kernel builds */
 621         kmutex_t        ht_lock;
 622 #ifdef _KERNEL
 623         unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; /* fill up to HT_LOCK_PAD bytes */
 624 #endif
 625 };
 626
 627 #define BUF_LOCKS 256   /* power of two: BUF_HASH_LOCK_NTRY masks with BUF_LOCKS-1 */
 628 typedef struct buf_hash_table {
 629         uint64_t ht_mask;       /* ANDed with buf_hash() by BUF_HASH_INDEX */
 630         arc_buf_hdr_t **ht_table;
 631         struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
 632 } buf_hash_table_t;
 633
 634 static buf_hash_table_t buf_hash_table;
 635
 636 #define BUF_HASH_INDEX(spa, dva, birth) /* hash of (spa, dva, birth) reduced by ht_mask */ \
 637         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 638 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])       /* lock slot for idx */
 639 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))    /* address of that slot's mutex */
 640 #define HDR_LOCK(hdr)   /* hash mutex covering hdr; hdr may be any expression */ \
 641         (BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, (hdr)->b_birth)))
 642
 643 uint64_t zfs_crc64_table[256];  /* NOTE(review): presumably the CRC-64 lookup table used by buf_hash() -- confirm */
 644
 645 /*
 646  * Level 2 ARC: default values for the tunables declared below
 647  */
 648
 649 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 650 #define L2ARC_HEADROOM          2               /* num of writes */
 651 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 652 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 653
 654 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent) /* alias for the kstat counter */
 655 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done) /* alias for the kstat counter */
 656
 657 /*
 658  * L2ARC Performance Tunables (each is exported read-write under vfs.zfs)
 659  */
 660 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 661 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 662 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 663 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 664 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 665 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 666 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 667 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 668
 669 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
 670     &l2arc_write_max, 0, "max write size");
 671 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
 672     &l2arc_write_boost, 0, "extra write during warmup");
 673 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
 674     &l2arc_headroom, 0, "number of dev writes");
 675 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
 676     &l2arc_feed_secs, 0, "interval seconds");
 677 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
 678     &l2arc_feed_min_ms, 0, "min interval milliseconds");
 679
 680 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,    /* the three boolean_t knobs go through SYSCTL_INT */
 681     &l2arc_noprefetch, 0, "don't cache prefetch bufs");
 682 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
 683     &l2arc_feed_again, 0, "turbo warmup");
 684 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
 685     &l2arc_norw, 0, "no reads during writes");
 686
 687 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
 688     &ARC_anon.arcs_size, 0, "size of anonymous state");
 689 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
 690     &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
 691 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
 692     &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
 693
 694 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
 695     &ARC_mru.arcs_size, 0, "size of mru state");
 696 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
 697     &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
 698 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
 699     &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
 700
 701 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
 702     &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
 703 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
 704     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
 705     "size of metadata in mru ghost state");
 706 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
 707     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
 708     "size of data in mru ghost state");
 709
 710 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
 711     &ARC_mfu.arcs_size, 0, "size of mfu state");
 712 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
 713     &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
 714 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
 715     &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
 716
 717 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
 718     &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
 719 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
 720     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
 721     "size of metadata in mfu ghost state");
 722 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
 723     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
 724     "size of data in mfu ghost state");
 725
 726 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
 727     &ARC_l2c_only.arcs_size, 0, "size of mru state");
 728
/*
 * L2ARC Internals
 */

/*
 * Per-device L2ARC state: the backing vdev and its pool, the write cursor
 * and address range on the device, write sizing, sweep/write status flags,
 * the list of buffers on the device, and the linkage into the device list.
 */
typedef struct l2arc_dev {
        vdev_t                  *l2ad_vdev;     /* vdev */
        spa_t                   *l2ad_spa;      /* spa */
        uint64_t                l2ad_hand;      /* next write location */
        uint64_t                l2ad_write;     /* desired write size, bytes */
        uint64_t                l2ad_boost;     /* warmup write boost, bytes */
        uint64_t                l2ad_start;     /* first addr on device */
        uint64_t                l2ad_end;       /* last addr on device */
        uint64_t                l2ad_evict;     /* last addr eviction reached */
        boolean_t               l2ad_first;     /* first sweep through */
        boolean_t               l2ad_writing;   /* currently writing */
        list_t                  *l2ad_buflist;  /* buffer list */
        list_node_t             l2ad_node;      /* device list node */
} l2arc_dev_t;
 746
/*
 * File-scope L2ARC state.  Each list is declared twice: once as the list
 * object itself (upper-case name) and once as a pointer (lower-case name);
 * presumably the pointer is aimed at the object during initialization,
 * which is outside this excerpt.
 */
static list_t L2ARC_dev_list;                   /* device list */
static list_t *l2arc_dev_list;                  /* device list pointer */
static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
static l2arc_dev_t *l2arc_dev_last;             /* last device used */
static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
static list_t L2ARC_free_on_write;              /* free after write buf list */
static list_t *l2arc_free_on_write;             /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
static uint64_t l2arc_ndev;                     /* number of devices */
 756
/*
 * Bookkeeping carried along with an L2ARC read: the buffer being read into
 * and the original request's spa, block pointer, bookmark and flags.
 */
typedef struct l2arc_read_callback {
        arc_buf_t       *l2rcb_buf;             /* read buffer */
        spa_t           *l2rcb_spa;             /* spa */
        blkptr_t        l2rcb_bp;               /* original blkptr */
        zbookmark_t     l2rcb_zb;               /* original bookmark */
        int             l2rcb_flags;            /* original flags */
} l2arc_read_callback_t;
 764
/*
 * Bookkeeping carried along with an L2ARC write: the target device and the
 * header marking the head of the list of buffers being written.
 */
typedef struct l2arc_write_callback {
        l2arc_dev_t     *l2wcb_dev;             /* device info */
        arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
} l2arc_write_callback_t;
 769
/*
 * L2ARC location of a buffer: which cache device holds it and the byte
 * offset on that device.
 */
struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr  mutex */
        l2arc_dev_t     *b_dev;                 /* L2ARC device */
        uint64_t        b_daddr;                /* disk address, offset byte */
};
 775
/*
 * One deferred-free record: a data pointer, its size, the function to free
 * it with, and the list linkage for queueing the record.
 */
typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
        void            (*l2df_func)(void *, size_t);
        list_node_t     l2df_list_node;
} l2arc_data_free_t;
 783
/*
 * Lock, condition variable and exit flag for the L2ARC feed thread
 * (NOTE(review): the thread itself is outside this excerpt; the roles are
 * inferred from the names).
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

/* Forward declarations for L2ARC helpers defined later in the file. */
static void l2arc_read_done(zio_t *zio);
static void l2arc_hdr_stat_add(void);
static void l2arc_hdr_stat_remove(void);
 791
 792 static uint64_t
 793 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 794 {
 795         uint8_t *vdva = (uint8_t *)dva;
 796         uint64_t crc = -1ULL;
 797         int i;
 798
 799         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 800
 801         for (i = 0; i < sizeof (dva_t); i++)
 802                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 803
 804         crc ^= (spa>>8) ^ birth;
 805
 806         return (crc);
 807 }
 808
 809 #define BUF_EMPTY(buf)                                          \
 810         ((buf)->b_dva.dva_word[0] == 0 &&                       \
 811         (buf)->b_dva.dva_word[1] == 0 &&                        \
 812         (buf)->b_birth == 0)
 813
 814 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 815         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
 816         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
 817         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 818
 819 static void
 820 buf_discard_identity(arc_buf_hdr_t *hdr)
 821 {
 822         hdr->b_dva.dva_word[0] = 0;
 823         hdr->b_dva.dva_word[1] = 0;
 824         hdr->b_birth = 0;
 825         hdr->b_cksum0 = 0;
 826 }
 827
 828 static arc_buf_hdr_t *
 829 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 830 {
 831         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 832         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 833         arc_buf_hdr_t *buf;
 834
 835         mutex_enter(hash_lock);
 836         for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
 837             buf = buf->b_hash_next) {
 838                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 839                         *lockp = hash_lock;
 840                         return (buf);
 841                 }
 842         }
 843         mutex_exit(hash_lock);
 844         *lockp = NULL;
 845         return (NULL);
 846 }
 847
 848 /*
 849  * Insert an entry into the hash table.  If there is already an element
 850  * equal to elem in the hash table, then the already existing element
 851  * will be returned and the new element will not be inserted.
 852  * Otherwise returns NULL.
 853  */
 854 static arc_buf_hdr_t *
 855 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 856 {
 857         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 858         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 859         arc_buf_hdr_t *fbuf;
 860         uint32_t i;
 861
 862         ASSERT(!HDR_IN_HASH_TABLE(buf));
 863         *lockp = hash_lock;
 864         mutex_enter(hash_lock);
 865         for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
 866             fbuf = fbuf->b_hash_next, i++) {
 867                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 868                         return (fbuf);
 869         }
 870
 871         buf->b_hash_next = buf_hash_table.ht_table[idx];
 872         buf_hash_table.ht_table[idx] = buf;
 873         buf->b_flags |= ARC_IN_HASH_TABLE;
 874
 875         /* collect some hash table performance data */
 876         if (i > 0) {
 877                 ARCSTAT_BUMP(arcstat_hash_collisions);
 878                 if (i == 1)
 879                         ARCSTAT_BUMP(arcstat_hash_chains);
 880
 881                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 882         }
 883
 884         ARCSTAT_BUMP(arcstat_hash_elements);
 885         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 886
 887         return (NULL);
 888 }
 889
 890 static void
 891 buf_hash_remove(arc_buf_hdr_t *buf)
 892 {
 893         arc_buf_hdr_t *fbuf, **bufp;
 894         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 895
 896         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 897         ASSERT(HDR_IN_HASH_TABLE(buf));
 898
 899         bufp = &buf_hash_table.ht_table[idx];
 900         while ((fbuf = *bufp) != buf) {
 901                 ASSERT(fbuf != NULL);
 902                 bufp = &fbuf->b_hash_next;
 903         }
 904         *bufp = buf->b_hash_next;
 905         buf->b_hash_next = NULL;
 906         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 907
 908         /* collect some hash table performance data */
 909         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 910
 911         if (buf_hash_table.ht_table[idx] &&
 912             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 913                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 914 }
 915
/*
 * Global data structures and functions for the buf kmem cache.
 *
 * hdr_cache supplies arc_buf_hdr_t objects and buf_cache supplies arc_buf_t
 * objects; both are created in buf_init() and destroyed in buf_fini().
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;
 921
 922 static void
 923 buf_fini(void)
 924 {
 925         int i;
 926
 927         kmem_free(buf_hash_table.ht_table,
 928             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 929         for (i = 0; i < BUF_LOCKS; i++)
 930                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 931         kmem_cache_destroy(hdr_cache);
 932         kmem_cache_destroy(buf_cache);
 933 }
 934
 935 /*
 936  * Constructor callback - called when the cache is empty
 937  * and a new buf is requested.
 938  */
 939 /* ARGSUSED */
 940 static int
 941 hdr_cons(void *vbuf, void *unused, int kmflag)
 942 {
 943         arc_buf_hdr_t *buf = vbuf;
 944
 945         bzero(buf, sizeof (arc_buf_hdr_t));
 946         refcount_create(&buf->b_refcnt);
 947         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 948         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 949         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 950
 951         return (0);
 952 }
 953
 954 /* ARGSUSED */
 955 static int
 956 buf_cons(void *vbuf, void *unused, int kmflag)
 957 {
 958         arc_buf_t *buf = vbuf;
 959
 960         bzero(buf, sizeof (arc_buf_t));
 961         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 962         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 963
 964         return (0);
 965 }
 966
 967 /*
 968  * Destructor callback - called when a cached buf is
 969  * no longer required.
 970  */
 971 /* ARGSUSED */
 972 static void
 973 hdr_dest(void *vbuf, void *unused)
 974 {
 975         arc_buf_hdr_t *buf = vbuf;
 976
 977         ASSERT(BUF_EMPTY(buf));
 978         refcount_destroy(&buf->b_refcnt);
 979         cv_destroy(&buf->b_cv);
 980         mutex_destroy(&buf->b_freeze_lock);
 981         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 982 }
 983
 984 /* ARGSUSED */
 985 static void
 986 buf_dest(void *vbuf, void *unused)
 987 {
 988         arc_buf_t *buf = vbuf;
 989
 990         mutex_destroy(&buf->b_evict_lock);
 991         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 992 }
 993
 994 /*
 995  * Reclaim callback -- invoked when memory is low.
 996  */
 997 /* ARGSUSED */
 998 static void
 999 hdr_recl(void *unused)
1000 {
1001         dprintf("hdr_recl called\n");
1002         /*
1003          * umem calls the reclaim func when we destroy the buf cache,
1004          * which is after we do arc_fini().
1005          */
1006         if (!arc_dead)
1007                 cv_signal(&arc_reclaim_thr_cv);
1008 }
1009
1010 static void
1011 buf_init(void)
1012 {
1013         uint64_t *ct;
1014         uint64_t hsize = 1ULL << 12;
1015         int i, j;
1016
1017         /*
1018          * The hash table is big enough to fill all of physical memory
1019          * with an average 64K block size.  The table will take up
1020          * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
1021          */
1022         while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
1023                 hsize <<= 1;
1024 retry:
1025         buf_hash_table.ht_mask = hsize - 1;
1026         buf_hash_table.ht_table =
1027             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1028         if (buf_hash_table.ht_table == NULL) {
1029                 ASSERT(hsize > (1ULL << 8));
1030                 hsize >>= 1;
1031                 goto retry;
1032         }
1033
1034         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1035             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1036         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1037             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1038
1039         for (i = 0; i < 256; i++)
1040                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1041                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1042
1043         for (i = 0; i < BUF_LOCKS; i++) {
1044                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1045                     NULL, MUTEX_DEFAULT, NULL);
1046         }
1047 }
1048
/*
 * Minimum time interval expressed as hz / 16; assuming hz is the number of
 * clock ticks per second, that is one sixteenth of a second.
 */
#define ARC_MINTIME     (hz>>4) /* 62 ms */
1050
1051 static void
1052 arc_cksum_verify(arc_buf_t *buf)
1053 {
1054         zio_cksum_t zc;
1055
1056         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1057                 return;
1058
1059         mutex_enter(&buf->b_hdr->b_freeze_lock);
1060         if (buf->b_hdr->b_freeze_cksum == NULL ||
1061             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1062                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1063                 return;
1064         }
1065         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1066         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1067                 panic("buffer modified while frozen!");
1068         mutex_exit(&buf->b_hdr->b_freeze_lock);
1069 }
1070
1071 static int
1072 arc_cksum_equal(arc_buf_t *buf)
1073 {
1074         zio_cksum_t zc;
1075         int equal;
1076
1077         mutex_enter(&buf->b_hdr->b_freeze_lock);
1078         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1079         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1080         mutex_exit(&buf->b_hdr->b_freeze_lock);
1081
1082         return (equal);
1083 }
1084
1085 static void
1086 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1087 {
1088         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1089                 return;
1090
1091         mutex_enter(&buf->b_hdr->b_freeze_lock);
1092         if (buf->b_hdr->b_freeze_cksum != NULL) {
1093                 mutex_exit(&buf->b_hdr->b_freeze_lock);
1094                 return;
1095         }
1096         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1097         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1098             buf->b_hdr->b_freeze_cksum);
1099         mutex_exit(&buf->b_hdr->b_freeze_lock);
1100 #ifdef illumos
1101         arc_buf_watch(buf);
1102 #endif /* illumos */
1103 }
1104
1105 #ifdef illumos
#ifndef _KERNEL
/*
 * Control message written to arc_procfd by arc_buf_watch() and
 * arc_buf_unwatch(): a command word followed by the watch description.
 */
typedef struct procctl {
        long cmd;
        prwatch_t prwatch;
} procctl_t;
#endif
1112
1113 /* ARGSUSED */
1114 static void
1115 arc_buf_unwatch(arc_buf_t *buf)
1116 {
1117 #ifndef _KERNEL
1118         if (arc_watch) {
1119                 int result;
1120                 procctl_t ctl;
1121                 ctl.cmd = PCWATCH;
1122                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1123                 ctl.prwatch.pr_size = 0;
1124                 ctl.prwatch.pr_wflags = 0;
1125                 result = write(arc_procfd, &ctl, sizeof (ctl));
1126                 ASSERT3U(result, ==, sizeof (ctl));
1127         }
1128 #endif
1129 }
1130
1131 /* ARGSUSED */
1132 static void
1133 arc_buf_watch(arc_buf_t *buf)
1134 {
1135 #ifndef _KERNEL
1136         if (arc_watch) {
1137                 int result;
1138                 procctl_t ctl;
1139                 ctl.cmd = PCWATCH;
1140                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1141                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1142                 ctl.prwatch.pr_wflags = WA_WRITE;
1143                 result = write(arc_procfd, &ctl, sizeof (ctl));
1144                 ASSERT3U(result, ==, sizeof (ctl));
1145         }
1146 #endif
1147 }
1148 #endif /* illumos */
1149
1150 void
1151 arc_buf_thaw(arc_buf_t *buf)
1152 {
1153         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1154                 if (buf->b_hdr->b_state != arc_anon)
1155                         panic("modifying non-anon buffer!");
1156                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1157                         panic("modifying buffer while i/o in progress!");
1158                 arc_cksum_verify(buf);
1159         }
1160
1161         mutex_enter(&buf->b_hdr->b_freeze_lock);
1162         if (buf->b_hdr->b_freeze_cksum != NULL) {
1163                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1164                 buf->b_hdr->b_freeze_cksum = NULL;
1165         }
1166
1167         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1168                 if (buf->b_hdr->b_thawed)
1169                         kmem_free(buf->b_hdr->b_thawed, 1);
1170                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1171         }
1172
1173         mutex_exit(&buf->b_hdr->b_freeze_lock);
1174
1175 #ifdef illumos
1176         arc_buf_unwatch(buf);
1177 #endif /* illumos */
1178 }
1179
1180 void
1181 arc_buf_freeze(arc_buf_t *buf)
1182 {
1183         kmutex_t *hash_lock;
1184
1185         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1186                 return;
1187
1188         hash_lock = HDR_LOCK(buf->b_hdr);
1189         mutex_enter(hash_lock);
1190
1191         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1192             buf->b_hdr->b_state == arc_anon);
1193         arc_cksum_compute(buf, B_FALSE);
1194         mutex_exit(hash_lock);
1195
1196 }
1197
1198 static void
1199 get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1200 {
1201         uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1202
1203         if (ab->b_type == ARC_BUFC_METADATA)
1204                 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1205         else {
1206                 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1207                 buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1208         }
1209
1210         *list = &state->arcs_lists[buf_hashid];
1211         *lock = ARCS_LOCK(state, buf_hashid);
1212 }
1213
1214
1215 static void
1216 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1217 {
1218         ASSERT(MUTEX_HELD(hash_lock));
1219
1220         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1221             (ab->b_state != arc_anon)) {
1222                 uint64_t delta = ab->b_size * ab->b_datacnt;
1223                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1224                 list_t *list;
1225                 kmutex_t *lock;
1226
1227                 get_buf_info(ab, ab->b_state, &list, &lock);
1228                 ASSERT(!MUTEX_HELD(lock));
1229                 mutex_enter(lock);
1230                 ASSERT(list_link_active(&ab->b_arc_node));
1231                 list_remove(list, ab);
1232                 if (GHOST_STATE(ab->b_state)) {
1233                         ASSERT0(ab->b_datacnt);
1234                         ASSERT3P(ab->b_buf, ==, NULL);
1235                         delta = ab->b_size;
1236                 }
1237                 ASSERT(delta > 0);
1238                 ASSERT3U(*size, >=, delta);
1239                 atomic_add_64(size, -delta);
1240                 mutex_exit(lock);
1241                 /* remove the prefetch flag if we get a reference */
1242                 if (ab->b_flags & ARC_PREFETCH)
1243                         ab->b_flags &= ~ARC_PREFETCH;
1244         }
1245 }
1246
1247 static int
1248 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1249 {
1250         int cnt;
1251         arc_state_t *state = ab->b_state;
1252
1253         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1254         ASSERT(!GHOST_STATE(state));
1255
1256         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1257             (state != arc_anon)) {
1258                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1259                 list_t *list;
1260                 kmutex_t *lock;
1261
1262                 get_buf_info(ab, state, &list, &lock);
1263                 ASSERT(!MUTEX_HELD(lock));
1264                 mutex_enter(lock);
1265                 ASSERT(!list_link_active(&ab->b_arc_node));
1266                 list_insert_head(list, ab);
1267                 ASSERT(ab->b_datacnt > 0);
1268                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1269                 mutex_exit(lock);
1270         }
1271         return (cnt);
1272 }
1273
/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 *
 *   new_state  state to move the header into; must differ from the
 *              header's current state
 *   ab         header being moved
 *   hash_lock  the header's hash lock, held by the caller
 *
 * If the header has no references it is also moved between the per-state
 * lists, and the per-type list sizes (arcs_lsize) are adjusted.  The
 * per-state totals (arcs_size) are adjusted in every case.  A header moving
 * to arc_anon is removed from the hash table if it is in it.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
        arc_state_t *old_state = ab->b_state;
        int64_t refcnt = refcount_count(&ab->b_refcnt);
        uint64_t from_delta, to_delta;
        list_t *list;
        kmutex_t *lock;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(new_state != old_state);
        ASSERT(refcnt == 0 || ab->b_datacnt > 0);
        ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
        ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

        /* By default both states are charged for all data copies. */
        from_delta = to_delta = ab->b_datacnt * ab->b_size;

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon) {
                        int use_mutex;
                        uint64_t *size = &old_state->arcs_lsize[ab->b_type];

                        get_buf_info(ab, old_state, &list, &lock);
                        /* Take the list lock only if not already held. */
                        use_mutex = !MUTEX_HELD(lock);
                        if (use_mutex)
                                mutex_enter(lock);

                        ASSERT(list_link_active(&ab->b_arc_node));
                        list_remove(list, ab);

                        /*
                         * If prefetching out of the ghost cache,
                         * we will have a non-zero datacnt.
                         */
                        if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
                                /* ghost elements have a ghost size */
                                ASSERT(ab->b_buf == NULL);
                                from_delta = ab->b_size;
                        }
                        ASSERT3U(*size, >=, from_delta);
                        atomic_add_64(size, -from_delta);

                        if (use_mutex)
                                mutex_exit(lock);
                }
                if (new_state != arc_anon) {
                        int use_mutex;
                        uint64_t *size = &new_state->arcs_lsize[ab->b_type];

                        get_buf_info(ab, new_state, &list, &lock);
                        /* Take the list lock only if not already held. */
                        use_mutex = !MUTEX_HELD(lock);
                        if (use_mutex)
                                mutex_enter(lock);

                        list_insert_head(list, ab);

                        /* ghost elements have a ghost size */
                        if (GHOST_STATE(new_state)) {
                                ASSERT(ab->b_datacnt == 0);
                                ASSERT(ab->b_buf == NULL);
                                to_delta = ab->b_size;
                        }
                        atomic_add_64(size, to_delta);

                        if (use_mutex)
                                mutex_exit(lock);
                }
        }

        ASSERT(!BUF_EMPTY(ab));
        /* Headers in the anonymous state are not kept in the hash table. */
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
                buf_hash_remove(ab);

        /* adjust state sizes */
        if (to_delta)
                atomic_add_64(&new_state->arcs_size, to_delta);
        if (from_delta) {
                ASSERT3U(old_state->arcs_size, >=, from_delta);
                atomic_add_64(&old_state->arcs_size, -from_delta);
        }
        ab->b_state = new_state;

        /* adjust l2arc hdr stats */
        if (new_state == arc_l2c_only)
                l2arc_hdr_stat_add();
        else if (old_state == arc_l2c_only)
                l2arc_hdr_stat_remove();
}
1370
1371 void
1372 arc_space_consume(uint64_t space, arc_space_type_t type)
1373 {
1374         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1375
1376         switch (type) {
1377         case ARC_SPACE_DATA:
1378                 ARCSTAT_INCR(arcstat_data_size, space);
1379                 break;
1380         case ARC_SPACE_OTHER:
1381                 ARCSTAT_INCR(arcstat_other_size, space);
1382                 break;
1383         case ARC_SPACE_HDRS:
1384                 ARCSTAT_INCR(arcstat_hdr_size, space);
1385                 break;
1386         case ARC_SPACE_L2HDRS:
1387                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1388                 break;
1389         }
1390
1391         atomic_add_64(&arc_meta_used, space);
1392         atomic_add_64(&arc_size, space);
1393 }
1394
1395 void
1396 arc_space_return(uint64_t space, arc_space_type_t type)
1397 {
1398         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1399
1400         switch (type) {
1401         case ARC_SPACE_DATA:
1402                 ARCSTAT_INCR(arcstat_data_size, -space);
1403                 break;
1404         case ARC_SPACE_OTHER:
1405                 ARCSTAT_INCR(arcstat_other_size, -space);
1406                 break;
1407         case ARC_SPACE_HDRS:
1408                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1409                 break;
1410         case ARC_SPACE_L2HDRS:
1411                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1412                 break;
1413         }
1414
1415         ASSERT(arc_meta_used >= space);
1416         if (arc_meta_max < arc_meta_used)
1417                 arc_meta_max = arc_meta_used;
1418         atomic_add_64(&arc_meta_used, -space);
1419         ASSERT(arc_size >= space);
1420         atomic_add_64(&arc_size, -space);
1421 }
1422
1423 void *
1424 arc_data_buf_alloc(uint64_t size)
1425 {
1426         if (arc_evict_needed(ARC_BUFC_DATA))
1427                 cv_signal(&arc_reclaim_thr_cv);
1428         atomic_add_64(&arc_size, size);
1429         return (zio_data_buf_alloc(size));
1430 }
1431
1432 void
1433 arc_data_buf_free(void *buf, uint64_t size)
1434 {
1435         zio_data_buf_free(buf, size);
1436         ASSERT(arc_size >= size);
1437         atomic_add_64(&arc_size, -size);
1438 }
1439
/*
 * Allocate a new anonymous ARC buffer of 'size' bytes and the given
 * contents type for pool 'spa'.  A fresh header and buffer are taken from
 * their kmem caches, the data is obtained via arc_get_data_buf(), and a
 * single reference is taken on behalf of 'tag'.  Returns the new buffer.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
        arc_buf_hdr_t *hdr;
        arc_buf_t *buf;

        ASSERT3U(size, >, 0);
        hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        /* A header fresh from the cache must not carry an identity. */
        ASSERT(BUF_EMPTY(hdr));
        hdr->b_size = size;
        hdr->b_type = type;
        hdr->b_spa = spa_load_guid(spa);
        hdr->b_state = arc_anon;
        hdr->b_arc_access = 0;
        buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
        buf->b_hdr = hdr;
        buf->b_data = NULL;
        buf->b_efunc = NULL;
        buf->b_private = NULL;
        buf->b_next = NULL;
        hdr->b_buf = buf;
        /* The header is linked to the buffer before the data is obtained. */
        arc_get_data_buf(buf);
        hdr->b_datacnt = 1;
        hdr->b_flags = 0;
        ASSERT(refcount_is_zero(&hdr->b_refcnt));
        (void) refcount_add(&hdr->b_refcnt, tag);

        return (buf);
}
1469
/* Reference tag that holds a buffer's refcount while it is on loan. */
static char *arc_onloan_tag = "onloan";
1471
1472 /*
1473  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1474  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1475  * buffers must be returned to the arc before they can be used by the DMU or
1476  * freed.
1477  */
1478 arc_buf_t *
1479 arc_loan_buf(spa_t *spa, int size)
1480 {
1481         arc_buf_t *buf;
1482
1483         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1484
1485         atomic_add_64(&arc_loaned_bytes, size);
1486         return (buf);
1487 }
1488
1489 /*
1490  * Return a loaned arc buffer to the arc.
1491  */
1492 void
1493 arc_return_buf(arc_buf_t *buf, void *tag)
1494 {
1495         arc_buf_hdr_t *hdr = buf->b_hdr;
1496
1497         ASSERT(buf->b_data != NULL);
1498         (void) refcount_add(&hdr->b_refcnt, tag);
1499         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1500
1501         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1502 }
1503
1504 /* Detach an arc_buf from a dbuf (tag) */
1505 void
1506 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1507 {
1508         arc_buf_hdr_t *hdr;
1509
1510         ASSERT(buf->b_data != NULL);
1511         hdr = buf->b_hdr;
1512         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1513         (void) refcount_remove(&hdr->b_refcnt, tag);
1514         buf->b_efunc = NULL;
1515         buf->b_private = NULL;
1516
1517         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1518 }
1519
1520 static arc_buf_t *
1521 arc_buf_clone(arc_buf_t *from)
1522 {
1523         arc_buf_t *buf;
1524         arc_buf_hdr_t *hdr = from->b_hdr;
1525         uint64_t size = hdr->b_size;
1526
1527         ASSERT(hdr->b_state != arc_anon);
1528
1529         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1530         buf->b_hdr = hdr;
1531         buf->b_data = NULL;
1532         buf->b_efunc = NULL;
1533         buf->b_private = NULL;
1534         buf->b_next = hdr->b_buf;
1535         hdr->b_buf = buf;
1536         arc_get_data_buf(buf);
1537         bcopy(from->b_data, buf->b_data, size);
1538
1539         /*
1540          * This buffer already exists in the arc so create a duplicate
1541          * copy for the caller.  If the buffer is associated with user data
1542          * then track the size and number of duplicates.  These stats will be
1543          * updated as duplicate buffers are created and destroyed.
1544          */
1545         if (hdr->b_type == ARC_BUFC_DATA) {
1546                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1547                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1548         }
1549         hdr->b_datacnt += 1;
1550         return (buf);
1551 }
1552
1553 void
1554 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1555 {
1556         arc_buf_hdr_t *hdr;
1557         kmutex_t *hash_lock;
1558
1559         /*
1560          * Check to see if this buffer is evicted.  Callers
1561          * must verify b_data != NULL to know if the add_ref
1562          * was successful.
1563          */
1564         mutex_enter(&buf->b_evict_lock);
1565         if (buf->b_data == NULL) {
1566                 mutex_exit(&buf->b_evict_lock);
1567                 return;
1568         }
1569         hash_lock = HDR_LOCK(buf->b_hdr);
1570         mutex_enter(hash_lock);
1571         hdr = buf->b_hdr;
1572         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1573         mutex_exit(&buf->b_evict_lock);
1574
1575         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1576         add_reference(hdr, hash_lock, tag);
1577         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1578         arc_access(hdr, hash_lock);
1579         mutex_exit(hash_lock);
1580         ARCSTAT_BUMP(arcstat_hits);
1581         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1582             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1583             data, metadata, hits);
1584 }
1585
1586 /*
1587  * Free the arc data buffer.  If it is an l2arc write in progress,
1588  * the buffer is placed on l2arc_free_on_write to be freed later.
1589  */
1590 static void
1591 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1592 {
1593         arc_buf_hdr_t *hdr = buf->b_hdr;
1594
1595         if (HDR_L2_WRITING(hdr)) {
1596                 l2arc_data_free_t *df;
1597                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1598                 df->l2df_data = buf->b_data;
1599                 df->l2df_size = hdr->b_size;
1600                 df->l2df_func = free_func;
1601                 mutex_enter(&l2arc_free_on_write_mtx);
1602                 list_insert_head(l2arc_free_on_write, df);
1603                 mutex_exit(&l2arc_free_on_write_mtx);
1604                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1605         } else {
1606                 free_func(buf->b_data, hdr->b_size);
1607         }
1608 }
1609
1610 static void
1611 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1612 {
1613         arc_buf_t **bufp;
1614
1615         /* free up data associated with the buf */
1616         if (buf->b_data) {
1617                 arc_state_t *state = buf->b_hdr->b_state;
1618                 uint64_t size = buf->b_hdr->b_size;
1619                 arc_buf_contents_t type = buf->b_hdr->b_type;
1620
1621                 arc_cksum_verify(buf);
1622 #ifdef illumos
1623                 arc_buf_unwatch(buf);
1624 #endif /* illumos */
1625
1626                 if (!recycle) {
1627                         if (type == ARC_BUFC_METADATA) {
1628                                 arc_buf_data_free(buf, zio_buf_free);
1629                                 arc_space_return(size, ARC_SPACE_DATA);
1630                         } else {
1631                                 ASSERT(type == ARC_BUFC_DATA);
1632                                 arc_buf_data_free(buf, zio_data_buf_free);
1633                                 ARCSTAT_INCR(arcstat_data_size, -size);
1634                                 atomic_add_64(&arc_size, -size);
1635                         }
1636                 }
1637                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1638                         uint64_t *cnt = &state->arcs_lsize[type];
1639
1640                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1641                         ASSERT(state != arc_anon);
1642
1643                         ASSERT3U(*cnt, >=, size);
1644                         atomic_add_64(cnt, -size);
1645                 }
1646                 ASSERT3U(state->arcs_size, >=, size);
1647                 atomic_add_64(&state->arcs_size, -size);
1648                 buf->b_data = NULL;
1649
1650                 /*
1651                  * If we're destroying a duplicate buffer make sure
1652                  * that the appropriate statistics are updated.
1653                  */
1654                 if (buf->b_hdr->b_datacnt > 1 &&
1655                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1656                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1657                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1658                 }
1659                 ASSERT(buf->b_hdr->b_datacnt > 0);
1660                 buf->b_hdr->b_datacnt -= 1;
1661         }
1662
1663         /* only remove the buf if requested */
1664         if (!all)
1665                 return;
1666
1667         /* remove the buf from the hdr list */
1668         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1669                 continue;
1670         *bufp = buf->b_next;
1671         buf->b_next = NULL;
1672
1673         ASSERT(buf->b_efunc == NULL);
1674
1675         /* clean up the buf */
1676         buf->b_hdr = NULL;
1677         kmem_cache_free(buf_cache, buf);
1678 }
1679
1680 static void
1681 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1682 {
1683         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1684         ASSERT3P(hdr->b_state, ==, arc_anon);
1685         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1686         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1687
1688         if (l2hdr != NULL) {
1689                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1690                 /*
1691                  * To prevent arc_free() and l2arc_evict() from
1692                  * attempting to free the same buffer at the same time,
1693                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1694                  * give it priority.  l2arc_evict() can't destroy this
1695                  * header while we are waiting on l2arc_buflist_mtx.
1696                  *
1697                  * The hdr may be removed from l2ad_buflist before we
1698                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1699                  */
1700                 if (!buflist_held) {
1701                         mutex_enter(&l2arc_buflist_mtx);
1702                         l2hdr = hdr->b_l2hdr;
1703                 }
1704
1705                 if (l2hdr != NULL) {
1706                         trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1707                             hdr->b_size, 0);
1708                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1709                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1710                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1711                         if (hdr->b_state == arc_l2c_only)
1712                                 l2arc_hdr_stat_remove();
1713                         hdr->b_l2hdr = NULL;
1714                 }
1715
1716                 if (!buflist_held)
1717                         mutex_exit(&l2arc_buflist_mtx);
1718         }
1719
1720         if (!BUF_EMPTY(hdr)) {
1721                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1722                 buf_discard_identity(hdr);
1723         }
1724         while (hdr->b_buf) {
1725                 arc_buf_t *buf = hdr->b_buf;
1726
1727                 if (buf->b_efunc) {
1728                         mutex_enter(&arc_eviction_mtx);
1729                         mutex_enter(&buf->b_evict_lock);
1730                         ASSERT(buf->b_hdr != NULL);
1731                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1732                         hdr->b_buf = buf->b_next;
1733                         buf->b_hdr = &arc_eviction_hdr;
1734                         buf->b_next = arc_eviction_list;
1735                         arc_eviction_list = buf;
1736                         mutex_exit(&buf->b_evict_lock);
1737                         mutex_exit(&arc_eviction_mtx);
1738                 } else {
1739                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1740                 }
1741         }
1742         if (hdr->b_freeze_cksum != NULL) {
1743                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1744                 hdr->b_freeze_cksum = NULL;
1745         }
1746         if (hdr->b_thawed) {
1747                 kmem_free(hdr->b_thawed, 1);
1748                 hdr->b_thawed = NULL;
1749         }
1750
1751         ASSERT(!list_link_active(&hdr->b_arc_node));
1752         ASSERT3P(hdr->b_hash_next, ==, NULL);
1753         ASSERT3P(hdr->b_acb, ==, NULL);
1754         kmem_cache_free(hdr_cache, hdr);
1755 }
1756
1757 void
1758 arc_buf_free(arc_buf_t *buf, void *tag)
1759 {
1760         arc_buf_hdr_t *hdr = buf->b_hdr;
1761         int hashed = hdr->b_state != arc_anon;
1762
1763         ASSERT(buf->b_efunc == NULL);
1764         ASSERT(buf->b_data != NULL);
1765
1766         if (hashed) {
1767                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1768
1769                 mutex_enter(hash_lock);
1770                 hdr = buf->b_hdr;
1771                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1772
1773                 (void) remove_reference(hdr, hash_lock, tag);
1774                 if (hdr->b_datacnt > 1) {
1775                         arc_buf_destroy(buf, FALSE, TRUE);
1776                 } else {
1777                         ASSERT(buf == hdr->b_buf);
1778                         ASSERT(buf->b_efunc == NULL);
1779                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1780                 }
1781                 mutex_exit(hash_lock);
1782         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1783                 int destroy_hdr;
1784                 /*
1785                  * We are in the middle of an async write.  Don't destroy
1786                  * this buffer unless the write completes before we finish
1787                  * decrementing the reference count.
1788                  */
1789                 mutex_enter(&arc_eviction_mtx);
1790                 (void) remove_reference(hdr, NULL, tag);
1791                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1792                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1793                 mutex_exit(&arc_eviction_mtx);
1794                 if (destroy_hdr)
1795                         arc_hdr_destroy(hdr);
1796         } else {
1797                 if (remove_reference(hdr, NULL, tag) > 0)
1798                         arc_buf_destroy(buf, FALSE, TRUE);
1799                 else
1800                         arc_hdr_destroy(hdr);
1801         }
1802 }
1803
1804 boolean_t
1805 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1806 {
1807         arc_buf_hdr_t *hdr = buf->b_hdr;
1808         kmutex_t *hash_lock = HDR_LOCK(hdr);
1809         boolean_t no_callback = (buf->b_efunc == NULL);
1810
1811         if (hdr->b_state == arc_anon) {
1812                 ASSERT(hdr->b_datacnt == 1);
1813                 arc_buf_free(buf, tag);
1814                 return (no_callback);
1815         }
1816
1817         mutex_enter(hash_lock);
1818         hdr = buf->b_hdr;
1819         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1820         ASSERT(hdr->b_state != arc_anon);
1821         ASSERT(buf->b_data != NULL);
1822
1823         (void) remove_reference(hdr, hash_lock, tag);
1824         if (hdr->b_datacnt > 1) {
1825                 if (no_callback)
1826                         arc_buf_destroy(buf, FALSE, TRUE);
1827         } else if (no_callback) {
1828                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1829                 ASSERT(buf->b_efunc == NULL);
1830                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1831         }
1832         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1833             refcount_is_zero(&hdr->b_refcnt));
1834         mutex_exit(hash_lock);
1835         return (no_callback);
1836 }
1837
1838 int
1839 arc_buf_size(arc_buf_t *buf)
1840 {
1841         return (buf->b_hdr->b_size);
1842 }
1843
1844 /*
1845  * Called from the DMU to determine if the current buffer should be
1846  * evicted. In order to ensure proper locking, the eviction must be initiated
1847  * from the DMU. Return true if the buffer is associated with user data and
1848  * duplicate buffers still exist.
1849  */
1850 boolean_t
1851 arc_buf_eviction_needed(arc_buf_t *buf)
1852 {
1853         arc_buf_hdr_t *hdr;
1854         boolean_t evict_needed = B_FALSE;
1855
1856         if (zfs_disable_dup_eviction)
1857                 return (B_FALSE);
1858
1859         mutex_enter(&buf->b_evict_lock);
1860         hdr = buf->b_hdr;
1861         if (hdr == NULL) {
1862                 /*
1863                  * We are in arc_do_user_evicts(); let that function
1864                  * perform the eviction.
1865                  */
1866                 ASSERT(buf->b_data == NULL);
1867                 mutex_exit(&buf->b_evict_lock);
1868                 return (B_FALSE);
1869         } else if (buf->b_data == NULL) {
1870                 /*
1871                  * We have already been added to the arc eviction list;
1872                  * recommend eviction.
1873                  */
1874                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1875                 mutex_exit(&buf->b_evict_lock);
1876                 return (B_TRUE);
1877         }
1878
1879         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1880                 evict_needed = B_TRUE;
1881
1882         mutex_exit(&buf->b_evict_lock);
1883         return (evict_needed);
1884 }
1885
1886 /*
1887  * Evict buffers from list until we've removed the specified number of
1888  * bytes.  Move the removed buffers to the appropriate evict state.
1889  * If the recycle flag is set, then attempt to "recycle" a buffer:
1890  * - look for a buffer to evict that is `bytes' long.
1891  * - return the data block from this buffer rather than freeing it.
1892  * This flag is used by callers that are trying to make space for a
1893  * new buffer in a full arc cache.
1894  *
1895  * This function makes a "best effort".  It skips over any buffers
1896  * it can't get a hash_lock on, and so may not catch all candidates.
1897  * It may also return without evicting as much space as requested.
1898  */
1899 static void *
1900 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1901     arc_buf_contents_t type)
1902 {
1903         arc_state_t *evicted_state;
1904         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1905         int64_t bytes_remaining;
1906         arc_buf_hdr_t *ab, *ab_prev = NULL;
1907         list_t *evicted_list, *list, *evicted_list_start, *list_start;
1908         kmutex_t *lock, *evicted_lock;
1909         kmutex_t *hash_lock;
1910         boolean_t have_lock;
1911         void *stolen = NULL;
1912         static int evict_metadata_offset, evict_data_offset;
1913         int i, idx, offset, list_count, count;
1914
1915         ASSERT(state == arc_mru || state == arc_mfu);
1916
1917         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1918
1919         if (type == ARC_BUFC_METADATA) {
1920                 offset = 0;
1921                 list_count = ARC_BUFC_NUMMETADATALISTS;
1922                 list_start = &state->arcs_lists[0];
1923                 evicted_list_start = &evicted_state->arcs_lists[0];
1924                 idx = evict_metadata_offset;
1925         } else {
1926                 offset = ARC_BUFC_NUMMETADATALISTS;
1927                 list_start = &state->arcs_lists[offset];
1928                 evicted_list_start = &evicted_state->arcs_lists[offset];
1929                 list_count = ARC_BUFC_NUMDATALISTS;
1930                 idx = evict_data_offset;
1931         }
1932         bytes_remaining = evicted_state->arcs_lsize[type];
1933         count = 0;
1934
1935 evict_start:
1936         list = &list_start[idx];
1937         evicted_list = &evicted_list_start[idx];
1938         lock = ARCS_LOCK(state, (offset + idx));
1939         evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1940
1941         mutex_enter(lock);
1942         mutex_enter(evicted_lock);
1943
1944         for (ab = list_tail(list); ab; ab = ab_prev) {
1945                 ab_prev = list_prev(list, ab);
1946                 bytes_remaining -= (ab->b_size * ab->b_datacnt);
1947                 /* prefetch buffers have a minimum lifespan */
1948                 if (HDR_IO_IN_PROGRESS(ab) ||
1949                     (spa && ab->b_spa != spa) ||
1950                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1951                     ddi_get_lbolt() - ab->b_arc_access <
1952                     arc_min_prefetch_lifespan)) {
1953                         skipped++;
1954                         continue;
1955                 }
1956                 /* "lookahead" for better eviction candidate */
1957                 if (recycle && ab->b_size != bytes &&
1958                     ab_prev && ab_prev->b_size == bytes)
1959                         continue;
1960                 hash_lock = HDR_LOCK(ab);
1961                 have_lock = MUTEX_HELD(hash_lock);
1962                 if (have_lock || mutex_tryenter(hash_lock)) {
1963                         ASSERT0(refcount_count(&ab->b_refcnt));
1964                         ASSERT(ab->b_datacnt > 0);
1965                         while (ab->b_buf) {
1966                                 arc_buf_t *buf = ab->b_buf;
1967                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1968                                         missed += 1;
1969                                         break;
1970                                 }
1971                                 if (buf->b_data) {
1972                                         bytes_evicted += ab->b_size;
1973                                         if (recycle && ab->b_type == type &&
1974                                             ab->b_size == bytes &&
1975                                             !HDR_L2_WRITING(ab)) {
1976                                                 stolen = buf->b_data;
1977                                                 recycle = FALSE;
1978                                         }
1979                                 }
1980                                 if (buf->b_efunc) {
1981                                         mutex_enter(&arc_eviction_mtx);
1982                                         arc_buf_destroy(buf,
1983                                             buf->b_data == stolen, FALSE);
1984                                         ab->b_buf = buf->b_next;
1985                                         buf->b_hdr = &arc_eviction_hdr;
1986                                         buf->b_next = arc_eviction_list;
1987                                         arc_eviction_list = buf;
1988                                         mutex_exit(&arc_eviction_mtx);
1989                                         mutex_exit(&buf->b_evict_lock);
1990                                 } else {
1991                                         mutex_exit(&buf->b_evict_lock);
1992                                         arc_buf_destroy(buf,
1993                                             buf->b_data == stolen, TRUE);
1994                                 }
1995                         }
1996
1997                         if (ab->b_l2hdr) {
1998                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1999                                     ab->b_size);
2000                         } else {
2001                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
2002                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
2003                                             ab->b_size);
2004                                 } else {
2005                                         ARCSTAT_INCR(
2006                                             arcstat_evict_l2_ineligible,
2007                                             ab->b_size);
2008                                 }
2009                         }
2010
2011                         if (ab->b_datacnt == 0) {
2012                                 arc_change_state(evicted_state, ab, hash_lock);
2013                                 ASSERT(HDR_IN_HASH_TABLE(ab));
2014                                 ab->b_flags |= ARC_IN_HASH_TABLE;
2015                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
2016                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2017                         }
2018                         if (!have_lock)
2019                                 mutex_exit(hash_lock);
2020                         if (bytes >= 0 && bytes_evicted >= bytes)
2021                                 break;
2022                         if (bytes_remaining > 0) {
2023                                 mutex_exit(evicted_lock);
2024                                 mutex_exit(lock);
2025                                 idx  = ((idx + 1) & (list_count - 1));
2026                                 count++;
2027                                 goto evict_start;
2028                         }
2029                 } else {
2030                         missed += 1;
2031                 }
2032         }
2033
2034         mutex_exit(evicted_lock);
2035         mutex_exit(lock);
2036
2037         idx  = ((idx + 1) & (list_count - 1));
2038         count++;
2039
2040         if (bytes_evicted < bytes) {
2041                 if (count < list_count)
2042                         goto evict_start;
2043                 else
2044                         dprintf("only evicted %lld bytes from %x",
2045                             (longlong_t)bytes_evicted, state);
2046         }
2047         if (type == ARC_BUFC_METADATA)
2048                 evict_metadata_offset = idx;
2049         else
2050                 evict_data_offset = idx;
2051
2052         if (skipped)
2053                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
2054
2055         if (missed)
2056                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
2057
2058         /*
2059          * We have just evicted some data into the ghost state, make
2060          * sure we also adjust the ghost state size if necessary.
2061          */
2062         if (arc_no_grow &&
2063             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2064                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2065                     arc_mru_ghost->arcs_size - arc_c;
2066
2067                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2068                         int64_t todelete =
2069                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2070                         arc_evict_ghost(arc_mru_ghost, 0, todelete);
2071                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2072                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2073                             arc_mru_ghost->arcs_size +
2074                             arc_mfu_ghost->arcs_size - arc_c);
2075                         arc_evict_ghost(arc_mfu_ghost, 0, todelete);
2076                 }
2077         }
2078         if (stolen)
2079                 ARCSTAT_BUMP(arcstat_stolen);
2080
2081         return (stolen);
2082 }
2083
2084 /*
2085  * Remove buffers from list until we've removed the specified number of
2086  * bytes.  Destroy the buffers that are removed.
2087  */
2088 static void
2089 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2090 {
2091         arc_buf_hdr_t *ab, *ab_prev;
2092         arc_buf_hdr_t marker = { 0 };
2093         list_t *list, *list_start;
2094         kmutex_t *hash_lock, *lock;
2095         uint64_t bytes_deleted = 0;
2096         uint64_t bufs_skipped = 0;
2097         static int evict_offset;
2098         int list_count, idx = evict_offset;
2099         int offset, count = 0;
2100
2101         ASSERT(GHOST_STATE(state));
2102
2103         /*
2104          * data lists come after metadata lists
2105          */
2106         list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2107         list_count = ARC_BUFC_NUMDATALISTS;
2108         offset = ARC_BUFC_NUMMETADATALISTS;
2109
2110 evict_start:
2111         list = &list_start[idx];
2112         lock = ARCS_LOCK(state, idx + offset);
2113
2114         mutex_enter(lock);
2115         for (ab = list_tail(list); ab; ab = ab_prev) {
2116                 ab_prev = list_prev(list, ab);
2117                 if (spa && ab->b_spa != spa)
2118                         continue;
2119
2120                 /* ignore markers */
2121                 if (ab->b_spa == 0)
2122                         continue;
2123
2124                 hash_lock = HDR_LOCK(ab);
2125                 /* caller may be trying to modify this buffer, skip it */
2126                 if (MUTEX_HELD(hash_lock))
2127                         continue;
2128                 if (mutex_tryenter(hash_lock)) {
2129                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
2130                         ASSERT(ab->b_buf == NULL);
2131                         ARCSTAT_BUMP(arcstat_deleted);
2132                         bytes_deleted += ab->b_size;
2133
2134                         if (ab->b_l2hdr != NULL) {
2135                                 /*
2136                                  * This buffer is cached on the 2nd Level ARC;
2137                                  * don't destroy the header.
2138                                  */
2139                                 arc_change_state(arc_l2c_only, ab, hash_lock);
2140                                 mutex_exit(hash_lock);
2141                         } else {
2142                                 arc_change_state(arc_anon, ab, hash_lock);
2143                                 mutex_exit(hash_lock);
2144                                 arc_hdr_destroy(ab);
2145                         }
2146
2147                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2148                         if (bytes >= 0 && bytes_deleted >= bytes)
2149                                 break;
2150                 } else if (bytes < 0) {
2151                         /*
2152                          * Insert a list marker and then wait for the
2153                          * hash lock to become available. Once its
2154                          * available, restart from where we left off.
2155                          */
2156                         list_insert_after(list, ab, &marker);
2157                         mutex_exit(lock);
2158                         mutex_enter(hash_lock);
2159                         mutex_exit(hash_lock);
2160                         mutex_enter(lock);
2161                         ab_prev = list_prev(list, &marker);
2162                         list_remove(list, &marker);
2163                 } else
2164                         bufs_skipped += 1;
2165         }
2166         mutex_exit(lock);
2167         idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2168         count++;
2169
2170         if (count < list_count)
2171                 goto evict_start;
2172
2173         evict_offset = idx;
2174         if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2175             (bytes < 0 || bytes_deleted < bytes)) {
2176                 list_start = &state->arcs_lists[0];
2177                 list_count = ARC_BUFC_NUMMETADATALISTS;
2178                 offset = count = 0;
2179                 goto evict_start;
2180         }
2181
2182         if (bufs_skipped) {
2183                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2184                 ASSERT(bytes >= 0);
2185         }
2186
2187         if (bytes_deleted < bytes)
2188                 dprintf("only deleted %lld bytes from %p",
2189                     (longlong_t)bytes_deleted, state);
2190 }
2191
2192 static void
2193 arc_adjust(void)
2194 {
2195         int64_t adjustment, delta;
2196
2197         /*
2198          * Adjust MRU size
2199          */
2200
2201         adjustment = MIN((int64_t)(arc_size - arc_c),
2202             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2203             arc_p));
2204
2205         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2206                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2207                 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2208                 adjustment -= delta;
2209         }
2210
2211         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2212                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2213                 (void) arc_evict(arc_mru, 0, delta, FALSE,
2214                     ARC_BUFC_METADATA);
2215         }
2216
2217         /*
2218          * Adjust MFU size
2219          */
2220
2221         adjustment = arc_size - arc_c;
2222
2223         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2224                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2225                 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2226                 adjustment -= delta;
2227         }
2228
2229         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2230                 int64_t delta = MIN(adjustment,
2231                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2232                 (void) arc_evict(arc_mfu, 0, delta, FALSE,
2233                     ARC_BUFC_METADATA);
2234         }
2235
2236         /*
2237          * Adjust ghost lists
2238          */
2239
2240         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2241
2242         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2243                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2244                 arc_evict_ghost(arc_mru_ghost, 0, delta);
2245         }
2246
2247         adjustment =
2248             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2249
2250         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2251                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2252                 arc_evict_ghost(arc_mfu_ghost, 0, delta);
2253         }
2254 }
2255
2256 static void
2257 arc_do_user_evicts(void)
2258 {
2259         static arc_buf_t *tmp_arc_eviction_list;
2260
2261         /*
2262          * Move list over to avoid LOR
2263          */
2264 restart:
2265         mutex_enter(&arc_eviction_mtx);
2266         tmp_arc_eviction_list = arc_eviction_list;
2267         arc_eviction_list = NULL;
2268         mutex_exit(&arc_eviction_mtx);
2269
2270         while (tmp_arc_eviction_list != NULL) {
2271                 arc_buf_t *buf = tmp_arc_eviction_list;
2272                 tmp_arc_eviction_list = buf->b_next;
2273                 mutex_enter(&buf->b_evict_lock);
2274                 buf->b_hdr = NULL;
2275                 mutex_exit(&buf->b_evict_lock);
2276
2277                 if (buf->b_efunc != NULL)
2278                         VERIFY(buf->b_efunc(buf) == 0);
2279
2280                 buf->b_efunc = NULL;
2281                 buf->b_private = NULL;
2282                 kmem_cache_free(buf_cache, buf);
2283         }
2284
2285         if (arc_eviction_list != NULL)
2286                 goto restart;
2287 }
2288
2289 /*
2290  * Flush all *evictable* data from the cache for the given spa.
2291  * NOTE: this will not touch "active" (i.e. referenced) data.
2292  */
2293 void
2294 arc_flush(spa_t *spa)
2295 {
2296         uint64_t guid = 0;
2297
2298         if (spa)
2299                 guid = spa_load_guid(spa);
2300
2301         while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2302                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2303                 if (spa)
2304                         break;
2305         }
2306         while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2307                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2308                 if (spa)
2309                         break;
2310         }
2311         while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2312                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2313                 if (spa)
2314                         break;
2315         }
2316         while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2317                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2318                 if (spa)
2319                         break;
2320         }
2321
2322         arc_evict_ghost(arc_mru_ghost, guid, -1);
2323         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2324
2325         mutex_enter(&arc_reclaim_thr_lock);
2326         arc_do_user_evicts();
2327         mutex_exit(&arc_reclaim_thr_lock);
2328         ASSERT(spa || arc_eviction_list == NULL);
2329 }
2330
2331 void
2332 arc_shrink(void)
2333 {
2334         if (arc_c > arc_c_min) {
2335                 uint64_t to_free;
2336
2337 #ifdef _KERNEL
2338                 to_free = arc_c >> arc_shrink_shift;
2339 #else
2340                 to_free = arc_c >> arc_shrink_shift;
2341 #endif
2342                 if (arc_c > arc_c_min + to_free)
2343                         atomic_add_64(&arc_c, -to_free);
2344                 else
2345                         arc_c = arc_c_min;
2346
2347                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2348                 if (arc_c > arc_size)
2349                         arc_c = MAX(arc_size, arc_c_min);
2350                 if (arc_p > arc_c)
2351                         arc_p = (arc_c >> 1);
2352                 ASSERT(arc_c >= arc_c_min);
2353                 ASSERT((int64_t)arc_p >= 0);
2354         }
2355
2356         if (arc_size > arc_c)
2357                 arc_adjust();
2358 }
2359
/*
 * Non-zero while a low-memory request is outstanding (the comment in
 * arc_reclaim_thread() ties it to the vm_lowmem hook); the reclaim
 * thread clears it and issues a wakeup on its address.
 */
static int needfree = 0;

/*
 * Report whether memory pressure calls for the ARC to give memory back.
 * Returns 1 when reclaim is needed and 0 otherwise.
 *
 * Kernel builds answer 1 when needfree is set or vm_paging_needed()
 * says so, then apply the platform-specific checks below.  Non-kernel
 * builds answer 1 whenever spa_get_random(100) returns 0.
 */
static int
arc_reclaim_needed(void)
{
#if defined(_KERNEL) && defined(sun)
        /*
         * Scratch for the Solaris-only checks.  The branch below assigns
         * and reads 'extra', so it has to be declared for that branch
         * to compile; other builds never see it.
         */
        uint64_t extra;
#endif

#ifdef _KERNEL

        if (needfree)
                return (1);

        /*
         * Cooperate with pagedaemon when it's time for it to scan
         * and reclaim some pages.
         */
        if (vm_paging_needed())
                return (1);

#ifdef sun
        /*
         * take 'desfree' extra pages, so we reclaim sooner, rather than later
         */
        extra = desfree;

        /*
         * check that we're out of range of the pageout scanner.  It starts to
         * schedule paging if freemem is less than lotsfree and needfree.
         * lotsfree is the high-water mark for pageout, and needfree is the
         * number of needed free pages.  We add extra pages here to make sure
         * the scanner doesn't start up while we're freeing memory.
         */
        if (freemem < lotsfree + needfree + extra)
                return (1);

        /*
         * check to make sure that swapfs has enough space so that anon
         * reservations can still succeed. anon_resvmem() checks that the
         * availrmem is greater than swapfs_minfree, and the number of reserved
         * swap pages.  We also add a bit of extra here just to prevent
         * circumstances from getting really dire.
         */
        if (availrmem < swapfs_minfree + swapfs_reserve + extra)
                return (1);

#if defined(__i386)
        /*
         * If we're on an i386 platform, it's possible that we'll exhaust the
         * kernel heap space before we ever run out of available physical
         * memory.  Most checks of the size of the heap_area compare against
         * tune.t_minarmem, which is the minimum available real memory that we
         * can have in the system.  However, this is generally fixed at 25 pages
         * which is so low that it's useless.  In this comparison, we seek to
         * calculate the total heap-size, and reclaim if more than 3/4ths of the
         * heap is allocated.  (Or, in the calculation, if less than 1/4th is
         * free)
         */
        if (btop(vmem_size(heap_arena, VMEM_FREE)) <
            (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
                return (1);
#endif
#else   /* !sun */
        /* reclaim once more than 3/4 of the kmem size is in use */
        if (kmem_used() > (kmem_size() * 3) / 4)
                return (1);
#endif  /* sun */

#else
        if (spa_get_random(100) == 0)
                return (1);
#endif
        return (0);
}
2431
2432 extern kmem_cache_t     *zio_buf_cache[];
2433 extern kmem_cache_t     *zio_data_buf_cache[];
2434
2435 static void
2436 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2437 {
2438         size_t                  i;
2439         kmem_cache_t            *prev_cache = NULL;
2440         kmem_cache_t            *prev_data_cache = NULL;
2441
2442 #ifdef _KERNEL
2443         if (arc_meta_used >= arc_meta_limit) {
2444                 /*
2445                  * We are exceeding our meta-data cache limit.
2446                  * Purge some DNLC entries to release holds on meta-data.
2447                  */
2448                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2449         }
2450 #if defined(__i386)
2451         /*
2452          * Reclaim unused memory from all kmem caches.
2453          */
2454         kmem_reap();
2455 #endif
2456 #endif
2457
2458         /*
2459          * An aggressive reclamation will shrink the cache size as well as
2460          * reap free buffers from the arc kmem caches.
2461          */
2462         if (strat == ARC_RECLAIM_AGGR)
2463                 arc_shrink();
2464
2465         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2466                 if (zio_buf_cache[i] != prev_cache) {
2467                         prev_cache = zio_buf_cache[i];
2468                         kmem_cache_reap_now(zio_buf_cache[i]);
2469                 }
2470                 if (zio_data_buf_cache[i] != prev_data_cache) {
2471                         prev_data_cache = zio_data_buf_cache[i];
2472                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2473                 }
2474         }
2475         kmem_cache_reap_now(buf_cache);
2476         kmem_cache_reap_now(hdr_cache);
2477 }
2478
2479 static void
2480 arc_reclaim_thread(void *dummy __unused)
2481 {
2482         clock_t                 growtime = 0;
2483         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2484         callb_cpr_t             cpr;
2485
2486         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2487
2488         mutex_enter(&arc_reclaim_thr_lock);
2489         while (arc_thread_exit == 0) {
2490                 if (arc_reclaim_needed()) {
2491
2492                         if (arc_no_grow) {
2493                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2494                                         last_reclaim = ARC_RECLAIM_AGGR;
2495                                 } else {
2496                                         last_reclaim = ARC_RECLAIM_CONS;
2497                                 }
2498                         } else {
2499                                 arc_no_grow = TRUE;
2500                                 last_reclaim = ARC_RECLAIM_AGGR;
2501                                 membar_producer();
2502                         }
2503
2504                         /* reset the growth delay for every reclaim */
2505                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2506
2507                         if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2508                                 /*
2509                                  * If needfree is TRUE our vm_lowmem hook
2510                                  * was called and in that case we must free some
2511                                  * memory, so switch to aggressive mode.
2512                                  */
2513                                 arc_no_grow = TRUE;
2514                                 last_reclaim = ARC_RECLAIM_AGGR;
2515                         }
2516                         arc_kmem_reap_now(last_reclaim);
2517                         arc_warm = B_TRUE;
2518
2519                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2520                         arc_no_grow = FALSE;
2521                 }
2522
2523                 arc_adjust();
2524
2525                 if (arc_eviction_list != NULL)
2526                         arc_do_user_evicts();
2527
2528 #ifdef _KERNEL
2529                 if (needfree) {
2530                         needfree = 0;
2531                         wakeup(&needfree);
2532                 }
2533 #endif
2534
2535                 /* block until needed, or one second, whichever is shorter */
2536                 CALLB_CPR_SAFE_BEGIN(&cpr);
2537                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2538                     &arc_reclaim_thr_lock, hz);
2539                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2540         }
2541
2542         arc_thread_exit = 0;
2543         cv_broadcast(&arc_reclaim_thr_cv);
2544         CALLB_CPR_EXIT(&cpr);           /* drops arc_reclaim_thr_lock */
2545         thread_exit();
2546 }
2547
2548 /*
2549  * Adapt arc info given the number of bytes we are trying to add and
2550  * the state that we are comming from.  This function is only called
2551  * when we are adding new content to the cache.
2552  */
2553 static void
2554 arc_adapt(int bytes, arc_state_t *state)
2555 {
2556         int mult;
2557         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2558
2559         if (state == arc_l2c_only)
2560                 return;
2561
2562         ASSERT(bytes > 0);
2563         /*
2564          * Adapt the target size of the MRU list:
2565          *      - if we just hit in the MRU ghost list, then increase
2566          *        the target size of the MRU list.
2567          *      - if we just hit in the MFU ghost list, then increase
2568          *        the target size of the MFU list by decreasing the
2569          *        target size of the MRU list.
2570          */
2571         if (state == arc_mru_ghost) {
2572                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2573                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2574                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2575
2576                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2577         } else if (state == arc_mfu_ghost) {
2578                 uint64_t delta;
2579
2580                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2581                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2582                 mult = MIN(mult, 10);
2583
2584                 delta = MIN(bytes * mult, arc_p);
2585                 arc_p = MAX(arc_p_min, arc_p - delta);
2586         }
2587         ASSERT((int64_t)arc_p >= 0);
2588
2589         if (arc_reclaim_needed()) {
2590                 cv_signal(&arc_reclaim_thr_cv);
2591                 return;
2592         }
2593
2594         if (arc_no_grow)
2595                 return;
2596
2597         if (arc_c >= arc_c_max)
2598                 return;
2599
2600         /*
2601          * If we're within (2 * maxblocksize) bytes of the target
2602          * cache size, increment the target cache size
2603          */
2604         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2605                 atomic_add_64(&arc_c, (int64_t)bytes);
2606                 if (arc_c > arc_c_max)
2607                         arc_c = arc_c_max;
2608                 else if (state == arc_anon)
2609                         atomic_add_64(&arc_p, (int64_t)bytes);
2610                 if (arc_p > arc_c)
2611                         arc_p = arc_c;
2612         }
2613         ASSERT((int64_t)arc_p >= 0);
2614 }
2615
2616 /*
2617  * Check if the cache has reached its limits and eviction is required
2618  * prior to insert.
2619  */
2620 static int
2621 arc_evict_needed(arc_buf_contents_t type)
2622 {
2623         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2624                 return (1);
2625
2626 #ifdef sun
2627 #ifdef _KERNEL
2628         /*
2629          * If zio data pages are being allocated out of a separate heap segment,
2630          * then enforce that the size of available vmem for this area remains
2631          * above about 1/32nd free.
2632          */
2633         if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2634             vmem_size(zio_arena, VMEM_FREE) <
2635             (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2636                 return (1);
2637 #endif
2638 #endif  /* sun */
2639
2640         if (arc_reclaim_needed())
2641                 return (1);
2642
2643         return (arc_size > arc_c);
2644 }
2645
2646 /*
2647  * The buffer, supplied as the first argument, needs a data block.
2648  * So, if we are at cache max, determine which cache should be victimized.
2649  * We have the following cases:
2650  *
2651  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2652  * In this situation if we're out of space, but the resident size of the MFU is
2653  * under the limit, victimize the MFU cache to satisfy this insertion request.
2654  *
2655  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2656  * Here, we've used up all of the available space for the MRU, so we need to
2657  * evict from our own cache instead.  Evict from the set of resident MRU
2658  * entries.
2659  *
2660  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2661  * c minus p represents the MFU space in the cache, since p is the size of the
2662  * cache that is dedicated to the MRU.  In this situation there's still space on
2663  * the MFU side, so the MRU side needs to be victimized.
2664  *
2665  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2666  * MFU's resident set is consuming more space than it has been allotted.  In
2667  * this situation, we must victimize our own cache, the MFU, for this insertion.
2668  */
2669 static void
2670 arc_get_data_buf(arc_buf_t *buf)
2671 {
2672         arc_state_t             *state = buf->b_hdr->b_state;
2673         uint64_t                size = buf->b_hdr->b_size;
2674         arc_buf_contents_t      type = buf->b_hdr->b_type;
2675
2676         arc_adapt(size, state);
2677
2678         /*
2679          * We have not yet reached cache maximum size,
2680          * just allocate a new buffer.
2681          */
2682         if (!arc_evict_needed(type)) {
2683                 if (type == ARC_BUFC_METADATA) {
2684                         buf->b_data = zio_buf_alloc(size);
2685                         arc_space_consume(size, ARC_SPACE_DATA);
2686                 } else {
2687                         ASSERT(type == ARC_BUFC_DATA);
2688                         buf->b_data = zio_data_buf_alloc(size);
2689                         ARCSTAT_INCR(arcstat_data_size, size);
2690                         atomic_add_64(&arc_size, size);
2691                 }
2692                 goto out;
2693         }
2694
2695         /*
2696          * If we are prefetching from the mfu ghost list, this buffer
2697          * will end up on the mru list; so steal space from there.
2698          */
2699         if (state == arc_mfu_ghost)
2700                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2701         else if (state == arc_mru_ghost)
2702                 state = arc_mru;
2703
2704         if (state == arc_mru || state == arc_anon) {
2705                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2706                 state = (arc_mfu->arcs_lsize[type] >= size &&
2707                     arc_p > mru_used) ? arc_mfu : arc_mru;
2708         } else {
2709                 /* MFU cases */
2710                 uint64_t mfu_space = arc_c - arc_p;
2711                 state =  (arc_mru->arcs_lsize[type] >= size &&
2712                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2713         }
2714         if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2715                 if (type == ARC_BUFC_METADATA) {
2716                         buf->b_data = zio_buf_alloc(size);
2717                         arc_space_consume(size, ARC_SPACE_DATA);
2718                 } else {
2719                         ASSERT(type == ARC_BUFC_DATA);
2720                         buf->b_data = zio_data_buf_alloc(size);
2721                         ARCSTAT_INCR(arcstat_data_size, size);
2722                         atomic_add_64(&arc_size, size);
2723                 }
2724                 ARCSTAT_BUMP(arcstat_recycle_miss);
2725         }
2726         ASSERT(buf->b_data != NULL);
2727 out:
2728         /*
2729          * Update the state size.  Note that ghost states have a
2730          * "ghost size" and so don't need to be updated.
2731          */
2732         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2733                 arc_buf_hdr_t *hdr = buf->b_hdr;
2734
2735                 atomic_add_64(&hdr->b_state->arcs_size, size);
2736                 if (list_link_active(&hdr->b_arc_node)) {
2737                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2738                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2739                 }
2740                 /*
2741                  * If we are growing the cache, and we are adding anonymous
2742                  * data, and we have outgrown arc_p, update arc_p
2743                  */
2744                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2745                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2746                         arc_p = MIN(arc_c, arc_p + size);
2747         }
2748         ARCSTAT_BUMP(arcstat_allocated);
2749 }
2750
2751 /*
2752  * This routine is called whenever a buffer is accessed.
2753  * NOTE: the hash lock is dropped in this function.
2754  */
2755 static void
2756 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2757 {
2758         clock_t now;
2759
2760         ASSERT(MUTEX_HELD(hash_lock));
2761
2762         if (buf->b_state == arc_anon) {
2763                 /*
2764                  * This buffer is not in the cache, and does not
2765                  * appear in our "ghost" list.  Add the new buffer
2766                  * to the MRU state.
2767                  */
2768
2769                 ASSERT(buf->b_arc_access == 0);
2770                 buf->b_arc_access = ddi_get_lbolt();
2771                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2772                 arc_change_state(arc_mru, buf, hash_lock);
2773
2774         } else if (buf->b_state == arc_mru) {
2775                 now = ddi_get_lbolt();
2776
2777                 /*
2778                  * If this buffer is here because of a prefetch, then either:
2779                  * - clear the flag if this is a "referencing" read
2780                  *   (any subsequent access will bump this into the MFU state).
2781                  * or
2782                  * - move the buffer to the head of the list if this is
2783                  *   another prefetch (to make it less likely to be evicted).
2784                  */
2785                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2786                         if (refcount_count(&buf->b_refcnt) == 0) {
2787                                 ASSERT(list_link_active(&buf->b_arc_node));
2788                         } else {
2789                                 buf->b_flags &= ~ARC_PREFETCH;
2790                                 ARCSTAT_BUMP(arcstat_mru_hits);
2791                         }
2792                         buf->b_arc_access = now;
2793                         return;
2794                 }
2795
2796                 /*
2797                  * This buffer has been "accessed" only once so far,
2798                  * but it is still in the cache. Move it to the MFU
2799                  * state.
2800                  */
2801                 if (now > buf->b_arc_access + ARC_MINTIME) {
2802                         /*
2803                          * More than 125ms have passed since we
2804                          * instantiated this buffer.  Move it to the
2805                          * most frequently used state.
2806                          */
2807                         buf->b_arc_access = now;
2808                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2809                         arc_change_state(arc_mfu, buf, hash_lock);
2810                 }
2811                 ARCSTAT_BUMP(arcstat_mru_hits);
2812         } else if (buf->b_state == arc_mru_ghost) {
2813                 arc_state_t     *new_state;
2814                 /*
2815                  * This buffer has been "accessed" recently, but
2816                  * was evicted from the cache.  Move it to the
2817                  * MFU state.
2818                  */
2819
2820                 if (buf->b_flags & ARC_PREFETCH) {
2821                         new_state = arc_mru;
2822                         if (refcount_count(&buf->b_refcnt) > 0)
2823                                 buf->b_flags &= ~ARC_PREFETCH;
2824                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2825                 } else {
2826                         new_state = arc_mfu;
2827                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2828                 }
2829
2830                 buf->b_arc_access = ddi_get_lbolt();
2831                 arc_change_state(new_state, buf, hash_lock);
2832
2833                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2834         } else if (buf->b_state == arc_mfu) {
2835                 /*
2836                  * This buffer has been accessed more than once and is
2837                  * still in the cache.  Keep it in the MFU state.
2838                  *
2839                  * NOTE: an add_reference() that occurred when we did
2840                  * the arc_read() will have kicked this off the list.
2841                  * If it was a prefetch, we will explicitly move it to
2842                  * the head of the list now.
2843                  */
2844                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2845                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2846                         ASSERT(list_link_active(&buf->b_arc_node));
2847                 }
2848                 ARCSTAT_BUMP(arcstat_mfu_hits);
2849                 buf->b_arc_access = ddi_get_lbolt();
2850         } else if (buf->b_state == arc_mfu_ghost) {
2851                 arc_state_t     *new_state = arc_mfu;
2852                 /*
2853                  * This buffer has been accessed more than once but has
2854                  * been evicted from the cache.  Move it back to the
2855                  * MFU state.
2856                  */
2857
2858                 if (buf->b_flags & ARC_PREFETCH) {
2859                         /*
2860                          * This is a prefetch access...
2861                          * move this block back to the MRU state.
2862                          */
2863                         ASSERT0(refcount_count(&buf->b_refcnt));
2864                         new_state = arc_mru;
2865                 }
2866
2867                 buf->b_arc_access = ddi_get_lbolt();
2868                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2869                 arc_change_state(new_state, buf, hash_lock);
2870
2871                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2872         } else if (buf->b_state == arc_l2c_only) {
2873                 /*
2874                  * This buffer is on the 2nd Level ARC.
2875                  */
2876
2877                 buf->b_arc_access = ddi_get_lbolt();
2878                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2879                 arc_change_state(arc_mfu, buf, hash_lock);
2880         } else {
2881                 ASSERT(!"invalid arc state");
2882         }
2883 }
2884
2885 /* a generic arc_done_func_t which you can use */
2886 /* ARGSUSED */
2887 void
2888 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2889 {
2890         if (zio == NULL || zio->io_error == 0)
2891                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2892         VERIFY(arc_buf_remove_ref(buf, arg));
2893 }
2894
2895 /* a generic arc_done_func_t */
2896 void
2897 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2898 {
2899         arc_buf_t **bufp = arg;
2900         if (zio && zio->io_error) {
2901                 VERIFY(arc_buf_remove_ref(buf, arg));
2902                 *bufp = NULL;
2903         } else {
2904                 *bufp = buf;
2905                 ASSERT(buf->b_data);
2906         }
2907 }
2908
2909 static void
2910 arc_read_done(zio_t *zio)
2911 {
2912         arc_buf_hdr_t   *hdr, *found;
2913         arc_buf_t       *buf;
2914         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2915         kmutex_t        *hash_lock;
2916         arc_callback_t  *callback_list, *acb;
2917         int             freeable = FALSE;
2918
2919         buf = zio->io_private;
2920         hdr = buf->b_hdr;
2921
2922         /*
2923          * The hdr was inserted into hash-table and removed from lists
2924          * prior to starting I/O.  We should find this header, since
2925          * it's in the hash table, and it should be legit since it's
2926          * not possible to evict it during the I/O.  The only possible
2927          * reason for it not to be found is if we were freed during the
2928          * read.
2929          */
2930         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2931             &hash_lock);
2932
2933         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2934             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2935             (found == hdr && HDR_L2_READING(hdr)));
2936
2937         hdr->b_flags &= ~ARC_L2_EVICTED;
2938         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2939                 hdr->b_flags &= ~ARC_L2CACHE;
2940
2941         /* byteswap if necessary */
2942         callback_list = hdr->b_acb;
2943         ASSERT(callback_list != NULL);
2944         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2945                 dmu_object_byteswap_t bswap =
2946                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2947                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2948                     byteswap_uint64_array :
2949                     dmu_ot_byteswap[bswap].ob_func;
2950                 func(buf->b_data, hdr->b_size);
2951         }
2952
2953         arc_cksum_compute(buf, B_FALSE);
2954 #ifdef illumos
2955         arc_buf_watch(buf);
2956 #endif /* illumos */
2957
2958         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2959                 /*
2960                  * Only call arc_access on anonymous buffers.  This is because
2961                  * if we've issued an I/O for an evicted buffer, we've already
2962                  * called arc_access (to prevent any simultaneous readers from
2963                  * getting confused).
2964                  */
2965                 arc_access(hdr, hash_lock);
2966         }
2967
2968         /* create copies of the data buffer for the callers */
2969         abuf = buf;
2970         for (acb = callback_list; acb; acb = acb->acb_next) {
2971                 if (acb->acb_done) {
2972                         if (abuf == NULL) {
2973                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2974                                 abuf = arc_buf_clone(buf);
2975                         }
2976                         acb->acb_buf = abuf;
2977                         abuf = NULL;
2978                 }
2979         }
2980         hdr->b_acb = NULL;
2981         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2982         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2983         if (abuf == buf) {
2984                 ASSERT(buf->b_efunc == NULL);
2985                 ASSERT(hdr->b_datacnt == 1);
2986                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2987         }
2988
2989         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2990
2991         if (zio->io_error != 0) {
2992                 hdr->b_flags |= ARC_IO_ERROR;
2993                 if (hdr->b_state != arc_anon)
2994                         arc_change_state(arc_anon, hdr, hash_lock);
2995                 if (HDR_IN_HASH_TABLE(hdr))
2996                         buf_hash_remove(hdr);
2997                 freeable = refcount_is_zero(&hdr->b_refcnt);
2998         }
2999
3000         /*
3001          * Broadcast before we drop the hash_lock to avoid the possibility
3002          * that the hdr (and hence the cv) might be freed before we get to
3003          * the cv_broadcast().
3004          */
3005         cv_broadcast(&hdr->b_cv);
3006
3007         if (hash_lock) {
3008                 mutex_exit(hash_lock);
3009         } else {
3010                 /*
3011                  * This block was freed while we waited for the read to
3012                  * complete.  It has been removed from the hash table and
3013                  * moved to the anonymous state (so that it won't show up
3014                  * in the cache).
3015                  */
3016                 ASSERT3P(hdr->b_state, ==, arc_anon);
3017                 freeable = refcount_is_zero(&hdr->b_refcnt);
3018         }
3019
3020         /* execute each callback and free its structure */
3021         while ((acb = callback_list) != NULL) {
3022                 if (acb->acb_done)
3023                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3024
3025                 if (acb->acb_zio_dummy != NULL) {
3026                         acb->acb_zio_dummy->io_error = zio->io_error;
3027                         zio_nowait(acb->acb_zio_dummy);
3028                 }
3029
3030                 callback_list = acb->acb_next;
3031                 kmem_free(acb, sizeof (arc_callback_t));
3032         }
3033
3034         if (freeable)
3035                 arc_hdr_destroy(hdr);
3036 }
3037
3038 /*
3039  * "Read" the block at the specified DVA (in bp) via the
3040  * cache.  If the block is found in the cache, invoke the provided
3041  * callback immediately and return.  Note that the `zio' parameter
3042  * in the callback will be NULL in this case, since no IO was
3043  * required.  If the block is not in the cache pass the read request
3044  * on to the spa with a substitute callback function, so that the
3045  * requested block will be added to the cache.
3046  *
3047  * If a read request arrives for a block that has a read in-progress,
3048  * either wait for the in-progress read to complete (and return the
3049  * results); or, if this is a read with a "done" func, add a record
3050  * to the read to invoke the "done" func when the read completes,
3051  * and return; or just return.
3052  *
3053  * arc_read_done() will invoke all the requested "done" functions
3054  * for readers of this block.
3055  */
3056 int
3057 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3058     void *private, int priority, int zio_flags, uint32_t *arc_flags,
3059     const zbookmark_t *zb)
3060 {
3061         arc_buf_hdr_t *hdr;
3062         arc_buf_t *buf = NULL;
3063         kmutex_t *hash_lock;
3064         zio_t *rzio;
3065         uint64_t guid = spa_load_guid(spa);
3066
3067 top:
3068         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3069             &hash_lock);
3070         if (hdr && hdr->b_datacnt > 0) {
3071
3072                 *arc_flags |= ARC_CACHED;
3073
3074                 if (HDR_IO_IN_PROGRESS(hdr)) {
3075
3076                         if (*arc_flags & ARC_WAIT) {
3077                                 cv_wait(&hdr->b_cv, hash_lock);
3078                                 mutex_exit(hash_lock);
3079                                 goto top;
3080                         }
3081                         ASSERT(*arc_flags & ARC_NOWAIT);
3082
3083                         if (done) {
3084                                 arc_callback_t  *acb = NULL;
3085
3086                                 acb = kmem_zalloc(sizeof (arc_callback_t),
3087                                     KM_SLEEP);
3088                                 acb->acb_done = done;
3089                                 acb->acb_private = private;
3090                                 if (pio != NULL)
3091                                         acb->acb_zio_dummy = zio_null(pio,
3092                                             spa, NULL, NULL, NULL, zio_flags);
3093
3094                                 ASSERT(acb->acb_done != NULL);
3095                                 acb->acb_next = hdr->b_acb;
3096                                 hdr->b_acb = acb;
3097                                 add_reference(hdr, hash_lock, private);
3098                                 mutex_exit(hash_lock);
3099                                 return (0);
3100                         }
3101                         mutex_exit(hash_lock);
3102                         return (0);
3103                 }
3104
3105                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3106
3107                 if (done) {
3108                         add_reference(hdr, hash_lock, private);
3109                         /*
3110                          * If this block is already in use, create a new
3111                          * copy of the data so that we will be guaranteed
3112                          * that arc_release() will always succeed.
3113                          */
3114                         buf = hdr->b_buf;
3115                         ASSERT(buf);
3116                         ASSERT(buf->b_data);
3117                         if (HDR_BUF_AVAILABLE(hdr)) {
3118                                 ASSERT(buf->b_efunc == NULL);
3119                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3120                         } else {
3121                                 buf = arc_buf_clone(buf);
3122                         }
3123
3124                 } else if (*arc_flags & ARC_PREFETCH &&
3125                     refcount_count(&hdr->b_refcnt) == 0) {
3126                         hdr->b_flags |= ARC_PREFETCH;
3127                 }
3128                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3129                 arc_access(hdr, hash_lock);
3130                 if (*arc_flags & ARC_L2CACHE)
3131                         hdr->b_flags |= ARC_L2CACHE;
3132                 mutex_exit(hash_lock);
3133                 ARCSTAT_BUMP(arcstat_hits);
3134                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3135                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3136                     data, metadata, hits);
3137
3138                 if (done)
3139                         done(NULL, buf, private);
3140         } else {
3141                 uint64_t size = BP_GET_LSIZE(bp);
3142                 arc_callback_t  *acb;
3143                 vdev_t *vd = NULL;
3144                 uint64_t addr = 0;
3145                 boolean_t devw = B_FALSE;
3146
3147                 if (hdr == NULL) {
3148                         /* this block is not in the cache */
3149                         arc_buf_hdr_t   *exists;
3150                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3151                         buf = arc_buf_alloc(spa, size, private, type);
3152                         hdr = buf->b_hdr;
3153                         hdr->b_dva = *BP_IDENTITY(bp);
3154                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3155                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3156                         exists = buf_hash_insert(hdr, &hash_lock);
3157                         if (exists) {
3158                                 /* somebody beat us to the hash insert */
3159                                 mutex_exit(hash_lock);
3160                                 buf_discard_identity(hdr);
3161                                 (void) arc_buf_remove_ref(buf, private);
3162                                 goto top; /* restart the IO request */
3163                         }
3164                         /* if this is a prefetch, we don't have a reference */
3165                         if (*arc_flags & ARC_PREFETCH) {
3166                                 (void) remove_reference(hdr, hash_lock,
3167                                     private);
3168                                 hdr->b_flags |= ARC_PREFETCH;
3169                         }
3170                         if (*arc_flags & ARC_L2CACHE)
3171                                 hdr->b_flags |= ARC_L2CACHE;
3172                         if (BP_GET_LEVEL(bp) > 0)
3173                                 hdr->b_flags |= ARC_INDIRECT;
3174                 } else {
3175                         /* this block is in the ghost cache */
3176                         ASSERT(GHOST_STATE(hdr->b_state));
3177                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3178                         ASSERT0(refcount_count(&hdr->b_refcnt));
3179                         ASSERT(hdr->b_buf == NULL);
3180
3181                         /* if this is a prefetch, we don't have a reference */
3182                         if (*arc_flags & ARC_PREFETCH)
3183                                 hdr->b_flags |= ARC_PREFETCH;
3184                         else
3185                                 add_reference(hdr, hash_lock, private);
3186                         if (*arc_flags & ARC_L2CACHE)
3187                                 hdr->b_flags |= ARC_L2CACHE;
3188                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3189                         buf->b_hdr = hdr;
3190                         buf->b_data = NULL;
3191                         buf->b_efunc = NULL;
3192                         buf->b_private = NULL;
3193                         buf->b_next = NULL;
3194                         hdr->b_buf = buf;
3195                         ASSERT(hdr->b_datacnt == 0);
3196                         hdr->b_datacnt = 1;
3197                         arc_get_data_buf(buf);
3198                         arc_access(hdr, hash_lock);
3199                 }
3200
3201                 ASSERT(!GHOST_STATE(hdr->b_state));
3202
3203                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3204                 acb->acb_done = done;
3205                 acb->acb_private = private;
3206
3207                 ASSERT(hdr->b_acb == NULL);
3208                 hdr->b_acb = acb;
3209                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3210
3211                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3212                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3213                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3214                         addr = hdr->b_l2hdr->b_daddr;
3215                         /*
3216                          * Lock out device removal.
3217                          */
3218                         if (vdev_is_dead(vd) ||
3219                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3220                                 vd = NULL;
3221                 }
3222
3223                 mutex_exit(hash_lock);
3224
3225                 /*
3226                  * At this point, we have a level 1 cache miss.  Try again in
3227                  * L2ARC if possible.
3228                  */
3229                 ASSERT3U(hdr->b_size, ==, size);
3230                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3231                     uint64_t, size, zbookmark_t *, zb);
3232                 ARCSTAT_BUMP(arcstat_misses);
3233                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3234                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3235                     data, metadata, misses);
3236 #ifdef _KERNEL
3237                 curthread->td_ru.ru_inblock++;
3238 #endif
3239
3240                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3241                         /*
3242                          * Read from the L2ARC if the following are true:
3243                          * 1. The L2ARC vdev was previously cached.
3244                          * 2. This buffer still has L2ARC metadata.
3245                          * 3. This buffer isn't currently writing to the L2ARC.
3246                          * 4. The L2ARC entry wasn't evicted, which may
3247                          *    also have invalidated the vdev.
3248                          * 5. This isn't prefetch and l2arc_noprefetch is set.
3249                          */
3250                         if (hdr->b_l2hdr != NULL &&
3251                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3252                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3253                                 l2arc_read_callback_t *cb;
3254
3255                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3256                                 ARCSTAT_BUMP(arcstat_l2_hits);
3257
3258                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3259                                     KM_SLEEP);
3260                                 cb->l2rcb_buf = buf;
3261                                 cb->l2rcb_spa = spa;
3262                                 cb->l2rcb_bp = *bp;
3263                                 cb->l2rcb_zb = *zb;
3264                                 cb->l2rcb_flags = zio_flags;
3265
3266                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3267                                     addr + size < vd->vdev_psize -
3268                                     VDEV_LABEL_END_SIZE);
3269
3270                                 /*
3271                                  * l2arc read.  The SCL_L2ARC lock will be
3272                                  * released by l2arc_read_done().
3273                                  */
3274                                 rzio = zio_read_phys(pio, vd, addr, size,
3275                                     buf->b_data, ZIO_CHECKSUM_OFF,
3276                                     l2arc_read_done, cb, priority, zio_flags |
3277                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3278                                     ZIO_FLAG_DONT_PROPAGATE |
3279                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
3280                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3281                                     zio_t *, rzio);
3282                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3283
3284                                 if (*arc_flags & ARC_NOWAIT) {
3285                                         zio_nowait(rzio);
3286                                         return (0);
3287                                 }
3288
3289                                 ASSERT(*arc_flags & ARC_WAIT);
3290                                 if (zio_wait(rzio) == 0)
3291                                         return (0);
3292
3293                                 /* l2arc read error; goto zio_read() */
3294                         } else {
3295                                 DTRACE_PROBE1(l2arc__miss,
3296                                     arc_buf_hdr_t *, hdr);
3297                                 ARCSTAT_BUMP(arcstat_l2_misses);
3298                                 if (HDR_L2_WRITING(hdr))
3299                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3300                                 spa_config_exit(spa, SCL_L2ARC, vd);
3301                         }
3302                 } else {
3303                         if (vd != NULL)
3304                                 spa_config_exit(spa, SCL_L2ARC, vd);
3305                         if (l2arc_ndev != 0) {
3306                                 DTRACE_PROBE1(l2arc__miss,
3307                                     arc_buf_hdr_t *, hdr);
3308                                 ARCSTAT_BUMP(arcstat_l2_misses);
3309                         }
3310                 }
3311
3312                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3313                     arc_read_done, buf, priority, zio_flags, zb);
3314
3315                 if (*arc_flags & ARC_WAIT)
3316                         return (zio_wait(rzio));
3317
3318                 ASSERT(*arc_flags & ARC_NOWAIT);
3319                 zio_nowait(rzio);
3320         }
3321         return (0);
3322 }
3323
3324 void
3325 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3326 {
3327         ASSERT(buf->b_hdr != NULL);
3328         ASSERT(buf->b_hdr->b_state != arc_anon);
3329         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3330         ASSERT(buf->b_efunc == NULL);
3331         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3332
3333         buf->b_efunc = func;
3334         buf->b_private = private;
3335 }
3336
3337 /*
3338  * This is used by the DMU to let the ARC know that a buffer is
3339  * being evicted, so the ARC should clean up.  If this arc buf
3340  * is not yet in the evicted state, it will be put there.
3341  */
3342 int
3343 arc_buf_evict(arc_buf_t *buf)
3344 {
3345         arc_buf_hdr_t *hdr;
3346         kmutex_t *hash_lock;
3347         arc_buf_t **bufp;
3348         list_t *list, *evicted_list;
3349         kmutex_t *lock, *evicted_lock;
3350
3351         mutex_enter(&buf->b_evict_lock);
3352         hdr = buf->b_hdr;
3353         if (hdr == NULL) {
3354                 /*
3355                  * We are in arc_do_user_evicts().
3356                  */
3357                 ASSERT(buf->b_data == NULL);
3358                 mutex_exit(&buf->b_evict_lock);
3359                 return (0);
3360         } else if (buf->b_data == NULL) {
3361                 arc_buf_t copy = *buf; /* structure assignment */
3362                 /*
3363                  * We are on the eviction list; process this buffer now
3364                  * but let arc_do_user_evicts() do the reaping.
3365                  */
3366                 buf->b_efunc = NULL;
3367                 mutex_exit(&buf->b_evict_lock);
3368                 VERIFY(copy.b_efunc(&copy) == 0);
3369                 return (1);
3370         }
3371         hash_lock = HDR_LOCK(hdr);
3372         mutex_enter(hash_lock);
3373         hdr = buf->b_hdr;
3374         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3375
3376         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3377         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3378
3379         /*
3380          * Pull this buffer off of the hdr
3381          */
3382         bufp = &hdr->b_buf;
3383         while (*bufp != buf)
3384                 bufp = &(*bufp)->b_next;
3385         *bufp = buf->b_next;
3386
3387         ASSERT(buf->b_data != NULL);
3388         arc_buf_destroy(buf, FALSE, FALSE);
3389
3390         if (hdr->b_datacnt == 0) {
3391                 arc_state_t *old_state = hdr->b_state;
3392                 arc_state_t *evicted_state;
3393
3394                 ASSERT(hdr->b_buf == NULL);
3395                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3396
3397                 evicted_state =
3398                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3399
3400                 get_buf_info(hdr, old_state, &list, &lock);
3401                 get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3402                 mutex_enter(lock);
3403                 mutex_enter(evicted_lock);
3404
3405                 arc_change_state(evicted_state, hdr, hash_lock);
3406                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3407                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3408                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3409
3410                 mutex_exit(evicted_lock);
3411                 mutex_exit(lock);
3412         }
3413         mutex_exit(hash_lock);
3414         mutex_exit(&buf->b_evict_lock);
3415
3416         VERIFY(buf->b_efunc(buf) == 0);
3417         buf->b_efunc = NULL;
3418         buf->b_private = NULL;
3419         buf->b_hdr = NULL;
3420         buf->b_next = NULL;
3421         kmem_cache_free(buf_cache, buf);
3422         return (1);
3423 }
3424
3425 /*
3426  * Release this buffer from the cache, making it an anonymous buffer.  This
3427  * must be done after a read and prior to modifying the buffer contents.
3428  * If the buffer has more than one reference, we must make
3429  * a new hdr for the buffer.
3430  */
3431 void
3432 arc_release(arc_buf_t *buf, void *tag)
3433 {
3434         arc_buf_hdr_t *hdr;
3435         kmutex_t *hash_lock = NULL;
3436         l2arc_buf_hdr_t *l2hdr;
3437         uint64_t buf_size;
3438
3439         /*
3440          * It would be nice to assert that if it's DMU metadata (level >
3441          * 0 || it's the dnode file), then it must be syncing context.
3442          * But we don't know that information at this level.
3443          */
3444
3445         mutex_enter(&buf->b_evict_lock);
3446         hdr = buf->b_hdr;
3447
3448         /* this buffer is not on any list */
3449         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3450
3451         if (hdr->b_state == arc_anon) {
3452                 /* this buffer is already released */
3453                 ASSERT(buf->b_efunc == NULL);
3454         } else {
3455                 hash_lock = HDR_LOCK(hdr);
3456                 mutex_enter(hash_lock);
3457                 hdr = buf->b_hdr;
3458                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3459         }
3460
3461         l2hdr = hdr->b_l2hdr;
3462         if (l2hdr) {
3463                 mutex_enter(&l2arc_buflist_mtx);
3464                 hdr->b_l2hdr = NULL;
3465         }
3466         buf_size = hdr->b_size;
3467
3468         /*
3469          * Do we have more than one buf?
3470          */
3471         if (hdr->b_datacnt > 1) {
3472                 arc_buf_hdr_t *nhdr;
3473                 arc_buf_t **bufp;
3474                 uint64_t blksz = hdr->b_size;
3475                 uint64_t spa = hdr->b_spa;
3476                 arc_buf_contents_t type = hdr->b_type;
3477                 uint32_t flags = hdr->b_flags;
3478
3479                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3480                 /*
3481                  * Pull the data off of this hdr and attach it to
3482                  * a new anonymous hdr.
3483                  */
3484                 (void) remove_reference(hdr, hash_lock, tag);
3485                 bufp = &hdr->b_buf;
3486                 while (*bufp != buf)
3487                         bufp = &(*bufp)->b_next;
3488                 *bufp = buf->b_next;
3489                 buf->b_next = NULL;
3490
3491                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3492                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3493                 if (refcount_is_zero(&hdr->b_refcnt)) {
3494                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3495                         ASSERT3U(*size, >=, hdr->b_size);
3496                         atomic_add_64(size, -hdr->b_size);
3497                 }
3498
3499                 /*
3500                  * We're releasing a duplicate user data buffer, update
3501                  * our statistics accordingly.
3502                  */
3503                 if (hdr->b_type == ARC_BUFC_DATA) {
3504                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3505                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3506                             -hdr->b_size);
3507                 }
3508                 hdr->b_datacnt -= 1;
3509                 arc_cksum_verify(buf);
3510 #ifdef illumos
3511                 arc_buf_unwatch(buf);
3512 #endif /* illumos */
3513
3514                 mutex_exit(hash_lock);
3515
3516                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3517                 nhdr->b_size = blksz;
3518                 nhdr->b_spa = spa;
3519                 nhdr->b_type = type;
3520                 nhdr->b_buf = buf;
3521                 nhdr->b_state = arc_anon;
3522                 nhdr->b_arc_access = 0;
3523                 nhdr->b_flags = flags & ARC_L2_WRITING;
3524                 nhdr->b_l2hdr = NULL;
3525                 nhdr->b_datacnt = 1;
3526                 nhdr->b_freeze_cksum = NULL;
3527                 (void) refcount_add(&nhdr->b_refcnt, tag);
3528                 buf->b_hdr = nhdr;
3529                 mutex_exit(&buf->b_evict_lock);
3530                 atomic_add_64(&arc_anon->arcs_size, blksz);
3531         } else {
3532                 mutex_exit(&buf->b_evict_lock);
3533                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3534                 ASSERT(!list_link_active(&hdr->b_arc_node));
3535                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3536                 if (hdr->b_state != arc_anon)
3537                         arc_change_state(arc_anon, hdr, hash_lock);
3538                 hdr->b_arc_access = 0;
3539                 if (hash_lock)
3540                         mutex_exit(hash_lock);
3541
3542                 buf_discard_identity(hdr);
3543                 arc_buf_thaw(buf);
3544         }
3545         buf->b_efunc = NULL;
3546         buf->b_private = NULL;
3547
3548         if (l2hdr) {
3549                 trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3550                     hdr->b_size, 0);
3551                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3552                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3553                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3554                 mutex_exit(&l2arc_buflist_mtx);
3555         }
3556 }
3557
3558 int
3559 arc_released(arc_buf_t *buf)
3560 {
3561         int released;
3562
3563         mutex_enter(&buf->b_evict_lock);
3564         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3565         mutex_exit(&buf->b_evict_lock);
3566         return (released);
3567 }
3568
3569 int
3570 arc_has_callback(arc_buf_t *buf)
3571 {
3572         int callback;
3573
3574         mutex_enter(&buf->b_evict_lock);
3575         callback = (buf->b_efunc != NULL);
3576         mutex_exit(&buf->b_evict_lock);
3577         return (callback);
3578 }
3579
3580 #ifdef ZFS_DEBUG
3581 int
3582 arc_referenced(arc_buf_t *buf)
3583 {
3584         int referenced;
3585
3586         mutex_enter(&buf->b_evict_lock);
3587         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3588         mutex_exit(&buf->b_evict_lock);
3589         return (referenced);
3590 }
3591 #endif
3592
3593 static void
3594 arc_write_ready(zio_t *zio)
3595 {
3596         arc_write_callback_t *callback = zio->io_private;
3597         arc_buf_t *buf = callback->awcb_buf;
3598         arc_buf_hdr_t *hdr = buf->b_hdr;
3599
3600         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3601         callback->awcb_ready(zio, buf, callback->awcb_private);
3602
3603         /*
3604          * If the IO is already in progress, then this is a re-write
3605          * attempt, so we need to thaw and re-compute the cksum.
3606          * It is the responsibility of the callback to handle the
3607          * accounting for any re-write attempt.
3608          */
3609         if (HDR_IO_IN_PROGRESS(hdr)) {
3610                 mutex_enter(&hdr->b_freeze_lock);
3611                 if (hdr->b_freeze_cksum != NULL) {
3612                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3613                         hdr->b_freeze_cksum = NULL;
3614                 }
3615                 mutex_exit(&hdr->b_freeze_lock);
3616         }
3617         arc_cksum_compute(buf, B_FALSE);
3618         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3619 }
3620
/*
 * zio "done" callback that arc_write() passes to zio_write().
 *
 * On success the block identity (DVA, physical birth, first checksum
 * word) is copied from the zio's block pointer into the header.  A
 * header that ends up with an identity is inserted into the hash table,
 * with special handling when an entry for that identity already exists.
 * ARC_IO_IN_PROGRESS is cleared in every case, the writer's awcb_done
 * callback is invoked, and the callback record is freed.
 */
3621 static void
3622 arc_write_done(zio_t *zio)
3623 {
3624         arc_write_callback_t *callback = zio->io_private;
3625         arc_buf_t *buf = callback->awcb_buf;
3626         arc_buf_hdr_t *hdr = buf->b_hdr;
3627
3628         ASSERT(hdr->b_acb == NULL);
3629
3630         if (zio->io_error == 0) {
3631                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3632                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3633                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3634         } else {
                     /* failed write: the header must not carry an identity */
3635                 ASSERT(BUF_EMPTY(hdr));
3636         }
3637
3638         /*
3639          * If the block to be written was all-zero, we may have
3640          * compressed it away.  In this case no write was performed
3641          * so there will be no dva/birth/checksum.  The buffer must
3642          * therefore remain anonymous (and uncached).
3643          */
3644         if (!BUF_EMPTY(hdr)) {
3645                 arc_buf_hdr_t *exists;
3646                 kmutex_t *hash_lock;
3647
3648                 ASSERT(zio->io_error == 0);
3649
3650                 arc_cksum_verify(buf);
3651
                     /*
                      * hash_lock is filled in by buf_hash_insert();
                      * NOTE(review): presumably it is returned held, since
                      * this function only ever exits it -- confirm there.
                      */
3652                 exists = buf_hash_insert(hdr, &hash_lock);
3653                 if (exists) {
3654                         /*
3655                          * This can only happen if we overwrite for
3656                          * sync-to-convergence, because we remove
3657                          * buffers from the hash table when we arc_free().
3658                          */
3659                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3660                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3661                                         panic("bad overwrite, hdr=%p exists=%p",
3662                                             (void *)hdr, (void *)exists);
3663                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
                                     /*
                                      * Evict the stale entry and retry the
                                      * insert, which must now succeed.
                                      */
3664                                 arc_change_state(arc_anon, exists, hash_lock);
3665                                 mutex_exit(hash_lock);
3666                                 arc_hdr_destroy(exists);
3667                                 exists = buf_hash_insert(hdr, &hash_lock);
3668                                 ASSERT3P(exists, ==, NULL);
3669                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3670                                 /* nopwrite */
3671                                 ASSERT(zio->io_prop.zp_nopwrite);
3672                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3673                                         panic("bad nopwrite, hdr=%p exists=%p",
3674                                             (void *)hdr, (void *)exists);
3675                         } else {
3676                                 /* Dedup */
3677                                 ASSERT(hdr->b_datacnt == 1);
3678                                 ASSERT(hdr->b_state == arc_anon);
3679                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3680                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3681                         }
3682                 }
3683                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3684                 /* if it's not anon, we are doing a scrub */
3685                 if (!exists && hdr->b_state == arc_anon)
3686                         arc_access(hdr, hash_lock);
3687                 mutex_exit(hash_lock);
3688         } else {
                     /* nothing was inserted, so there is no hash lock to drop */
3689                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3690         }
3691
3692         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3693         callback->awcb_done(zio, buf, callback->awcb_private);
3694
             /* the record was allocated by arc_write(); release it here */
3695         kmem_free(callback, sizeof (arc_write_callback_t));
3696 }
3697
3698 zio_t *
3699 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3700     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3701     arc_done_func_t *ready, arc_done_func_t *done, void *private,
3702     int priority, int zio_flags, const zbookmark_t *zb)
3703 {
3704         arc_buf_hdr_t *hdr = buf->b_hdr;
3705         arc_write_callback_t *callback;
3706         zio_t *zio;
3707
3708         ASSERT(ready != NULL);
3709         ASSERT(done != NULL);
3710         ASSERT(!HDR_IO_ERROR(hdr));
3711         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3712         ASSERT(hdr->b_acb == NULL);
3713         if (l2arc)
3714                 hdr->b_flags |= ARC_L2CACHE;
3715         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3716         callback->awcb_ready = ready;
3717         callback->awcb_done = done;
3718         callback->awcb_private = private;
3719         callback->awcb_buf = buf;
3720
3721         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3722             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3723
3724         return (zio);
3725 }
3726
/*
 * Decide whether a write reservation should be throttled because
 * memory is tight.
 *
 * reserve:       size of the reservation being requested.
 * inflight_data: amount of dirty data already in flight.
 * txg:           transaction group of the request; the first request
 *                seen for a newer txg resets the page-load counter.
 *
 * Returns 0 to let the request proceed, ERESTART when too much page
 * load or in-flight data is outstanding relative to available memory,
 * or EAGAIN when there is page load and the ARC needs to reclaim.
 * Outside of _KERNEL builds the function always returns 0.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
        uint64_t available_memory;
        static uint64_t page_load = 0;
        static uint64_t last_txg = 0;

        available_memory =
            ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
#ifdef sun
#if defined(__i386)
        available_memory =
            MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
#endif  /* sun */

        /* plenty of free memory: no throttling at all */
        if (available_memory >= zfs_write_limit_max)
                return (0);

        if (txg > last_txg) {
                last_txg = txg;
                page_load = 0;
        }

        /*
         * Requests from the pageout process mean memory is already
         * tight and the ARC is already going to be evicting, so page
         * writes are allowed through as quickly as possible, bounded
         * only by the accumulated page load.
         */
        if (curproc == pageproc) {
                if (page_load > available_memory / 4)
                        return (SET_ERROR(ERESTART));
                /* Note: reserve is inflated, so we deflate */
                page_load += reserve / 8;
                return (0);
        }

        if (page_load > 0 && arc_reclaim_needed()) {
                /* memory is low, delay before restarting */
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                return (SET_ERROR(EAGAIN));
        }
        page_load = 0;

        /*
         * Memory the ARC could give back counts as available, but only
         * down to arc_c_min.
         */
        if (arc_size > arc_c_min) {
                uint64_t evictable_memory;

                evictable_memory = arc_mru->arcs_lsize[ARC_BUFC_DATA] +
                    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
                    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
                    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
                available_memory += MIN(evictable_memory, arc_size - arc_c_min);
        }

        if (inflight_data > available_memory / 4) {
                ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
                return (SET_ERROR(ERESTART));
        }
#endif
        return (0);
}
3783
3784 void
3785 arc_tempreserve_clear(uint64_t reserve)
3786 {
3787         atomic_add_64(&arc_tempreserve, -reserve);
3788         ASSERT((int64_t)arc_tempreserve >= 0);
3789 }
3790
3791 int
3792 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3793 {
3794         int error;
3795         uint64_t anon_size;
3796
3797 #ifdef ZFS_DEBUG
3798         /*
3799          * Once in a while, fail for no reason.  Everything should cope.
3800          */
3801         if (spa_get_random(10000) == 0) {
3802                 dprintf("forcing random failure\n");
3803                 return (SET_ERROR(ERESTART));
3804         }
3805 #endif
3806         if (reserve > arc_c/4 && !arc_no_grow)
3807                 arc_c = MIN(arc_c_max, reserve * 4);
3808         if (reserve > arc_c)
3809                 return (SET_ERROR(ENOMEM));
3810
3811         /*
3812          * Don't count loaned bufs as in flight dirty data to prevent long
3813          * network delays from blocking transactions that are ready to be
3814          * assigned to a txg.
3815          */
3816         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3817
3818         /*
3819          * Writes will, almost always, require additional memory allocations
3820          * in order to compress/encrypt/etc the data.  We therefor need to
3821          * make sure that there is sufficient available memory for this.
3822          */
3823         if (error = arc_memory_throttle(reserve, anon_size, txg))
3824                 return (error);
3825
3826         /*
3827          * Throttle writes when the amount of dirty data in the cache
3828          * gets too large.  We try to keep the cache less than half full
3829          * of dirty blocks so that our sync times don't grow too large.
3830          * Note: if two requests come in concurrently, we might let them
3831          * both succeed, when one of them should fail.  Not a huge deal.
3832          */
3833
3834         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3835             anon_size > arc_c / 4) {
3836                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3837                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3838                     arc_tempreserve>>10,
3839                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3840                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3841                     reserve>>10, arc_c>>10);
3842                 return (SET_ERROR(ERESTART));
3843         }
3844         atomic_add_64(&arc_tempreserve, reserve);
3845         return (0);
3846 }
3847
3848 static kmutex_t arc_lowmem_lock;
3849 #ifdef _KERNEL
3850 static eventhandler_tag arc_event_lowmem = NULL;
3851
3852 static void
3853 arc_lowmem(void *arg __unused, int howto __unused)
3854 {
3855
3856         /* Serialize access via arc_lowmem_lock. */
3857         mutex_enter(&arc_lowmem_lock);
3858         mutex_enter(&arc_reclaim_thr_lock);
3859         needfree = 1;
3860         cv_signal(&arc_reclaim_thr_cv);
3861
3862         /*
3863          * It is unsafe to block here in arbitrary threads, because we can come
3864          * here from ARC itself and may hold ARC locks and thus risk a deadlock
3865          * with ARC reclaim thread.
3866          */
3867         if (curproc == pageproc) {
3868                 while (needfree)
3869                         msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
3870         }
3871         mutex_exit(&arc_reclaim_thr_lock);
3872         mutex_exit(&arc_lowmem_lock);
3873 }
3874 #endif
3875
3876 void
3877 arc_init(void)
3878 {
3879         int i, prefetch_tunable_set = 0;
3880
3881         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3882         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3883         mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3884
3885         /* Convert seconds to clock ticks */
3886         arc_min_prefetch_lifespan = 1 * hz;
3887
3888         /* Start out with 1/8 of all memory */
3889         arc_c = kmem_size() / 8;
3890
3891 #ifdef sun
3892 #ifdef _KERNEL
3893         /*
3894          * On architectures where the physical memory can be larger
3895          * than the addressable space (intel in 32-bit mode), we may
3896          * need to limit the cache to 1/8 of VM size.
3897          */
3898         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3899 #endif
3900 #endif  /* sun */
3901         /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3902         arc_c_min = MAX(arc_c / 4, 64<<18);
3903         /* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3904         if (arc_c * 8 >= 1<<30)
3905                 arc_c_max = (arc_c * 8) - (1<<30);
3906         else
3907                 arc_c_max = arc_c_min;
3908         arc_c_max = MAX(arc_c * 5, arc_c_max);
3909
3910 #ifdef _KERNEL
3911         /*
3912          * Allow the tunables to override our calculations if they are
3913          * reasonable (ie. over 16MB)
3914          */
3915         if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
3916                 arc_c_max = zfs_arc_max;
3917         if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
3918                 arc_c_min = zfs_arc_min;
3919 #endif
3920
3921         arc_c = arc_c_max;
3922         arc_p = (arc_c >> 1);
3923
3924         /* limit meta-data to 1/4 of the arc capacity */
3925         arc_meta_limit = arc_c_max / 4;
3926
3927         /* Allow the tunable to override if it is reasonable */
3928         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3929                 arc_meta_limit = zfs_arc_meta_limit;
3930
3931         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3932                 arc_c_min = arc_meta_limit / 2;
3933
3934         if (zfs_arc_grow_retry > 0)
3935                 arc_grow_retry = zfs_arc_grow_retry;
3936
3937         if (zfs_arc_shrink_shift > 0)
3938                 arc_shrink_shift = zfs_arc_shrink_shift;
3939
3940         if (zfs_arc_p_min_shift > 0)
3941                 arc_p_min_shift = zfs_arc_p_min_shift;
3942
3943         /* if kmem_flags are set, lets try to use less memory */
3944         if (kmem_debugging())
3945                 arc_c = arc_c / 2;
3946         if (arc_c < arc_c_min)
3947                 arc_c = arc_c_min;
3948
3949         zfs_arc_min = arc_c_min;
3950         zfs_arc_max = arc_c_max;
3951
3952         arc_anon = &ARC_anon;
3953         arc_mru = &ARC_mru;
3954         arc_mru_ghost = &ARC_mru_ghost;
3955         arc_mfu = &ARC_mfu;
3956         arc_mfu_ghost = &ARC_mfu_ghost;
3957         arc_l2c_only = &ARC_l2c_only;
3958         arc_size = 0;
3959
3960         for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3961                 mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
3962                     NULL, MUTEX_DEFAULT, NULL);
3963                 mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
3964                     NULL, MUTEX_DEFAULT, NULL);
3965                 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
3966                     NULL, MUTEX_DEFAULT, NULL);
3967                 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
3968                     NULL, MUTEX_DEFAULT, NULL);
3969                 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
3970                     NULL, MUTEX_DEFAULT, NULL);
3971                 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
3972                     NULL, MUTEX_DEFAULT, NULL);
3973
3974                 list_create(&arc_mru->arcs_lists[i],
3975                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3976                 list_create(&arc_mru_ghost->arcs_lists[i],
3977                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3978                 list_create(&arc_mfu->arcs_lists[i],
3979                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3980                 list_create(&arc_mfu_ghost->arcs_lists[i],
3981                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3982                 list_create(&arc_mfu_ghost->arcs_lists[i],
3983                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3984                 list_create(&arc_l2c_only->arcs_lists[i],
3985                     sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3986         }
3987
3988         buf_init();
3989
3990         arc_thread_exit = 0;
3991         arc_eviction_list = NULL;
3992         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3993         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3994
3995         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3996             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3997
3998         if (arc_ksp != NULL) {
3999                 arc_ksp->ks_data = &arc_stats;
4000                 kstat_install(arc_ksp);
4001         }
4002
4003         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4004             TS_RUN, minclsyspri);
4005
4006 #ifdef _KERNEL
4007         arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4008             EVENTHANDLER_PRI_FIRST);
4009 #endif
4010
4011         arc_dead = FALSE;
4012         arc_warm = B_FALSE;
4013
4014         if (zfs_write_limit_max == 0)
4015                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4016         else
4017                 zfs_write_limit_shift = 0;
4018         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4019
4020 #ifdef _KERNEL
4021         if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4022                 prefetch_tunable_set = 1;
4023
4024 #ifdef __i386__
4025         if (prefetch_tunable_set == 0) {
4026                 printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4027                     "-- to enable,\n");
4028                 printf("            add \"vfs.zfs.prefetch_disable=0\" "
4029                     "to /boot/loader.conf.\n");
4030                 zfs_prefetch_disable = 1;
4031         }
4032 #else
4033         if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4034             prefetch_tunable_set == 0) {
4035                 printf("ZFS NOTICE: Prefetch is disabled by default if less "
4036                     "than 4GB of RAM is present;\n"
4037                     "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4038                     "to /boot/loader.conf.\n");
4039                 zfs_prefetch_disable = 1;
4040         }
4041 #endif
4042         /* Warn about ZFS memory and address space requirements. */
4043         if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4044                 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4045                     "expect unstable behavior.\n");
4046         }
4047         if (kmem_size() < 512 * (1 << 20)) {
4048                 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4049                     "expect unstable behavior.\n");
4050                 printf("             Consider tuning vm.kmem_size and "
4051                     "vm.kmem_size_max\n");
4052                 printf("             in /boot/loader.conf.\n");
4053         }
4054 #endif
4055 }
4056
4057 void
4058 arc_fini(void)
4059 {
4060         int i;
4061
4062         mutex_enter(&arc_reclaim_thr_lock);
4063         arc_thread_exit = 1;
4064         cv_signal(&arc_reclaim_thr_cv);
4065         while (arc_thread_exit != 0)
4066                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4067         mutex_exit(&arc_reclaim_thr_lock);
4068
4069         arc_flush(NULL);
4070
4071         arc_dead = TRUE;
4072
4073         if (arc_ksp != NULL) {
4074                 kstat_delete(arc_ksp);
4075                 arc_ksp = NULL;
4076         }
4077
4078         mutex_destroy(&arc_eviction_mtx);
4079         mutex_destroy(&arc_reclaim_thr_lock);
4080         cv_destroy(&arc_reclaim_thr_cv);
4081
4082         for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4083                 list_destroy(&arc_mru->arcs_lists[i]);
4084                 list_destroy(&arc_mru_ghost->arcs_lists[i]);
4085                 list_destroy(&arc_mfu->arcs_lists[i]);
4086                 list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4087                 list_destroy(&arc_l2c_only->arcs_lists[i]);
4088
4089                 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4090                 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4091                 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4092                 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4093                 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4094                 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4095         }
4096
4097         mutex_destroy(&zfs_write_limit_lock);
4098
4099         buf_fini();
4100
4101         ASSERT(arc_loaned_bytes == 0);
4102
4103         mutex_destroy(&arc_lowmem_lock);
4104 #ifdef _KERNEL
4105         if (arc_event_lowmem != NULL)
4106                 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4107 #endif
4108 }
4109
4110 /*
4111  * Level 2 ARC
4112  *
4113  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4114  * It uses dedicated storage devices to hold cached data, which are populated
4115  * using large infrequent writes.  The main role of this cache is to boost
4116  * the performance of random read workloads.  The intended L2ARC devices
4117  * include short-stroked disks, solid state disks, and other media with
4118  * substantially faster read latency than disk.
4119  *
4120  *                 +-----------------------+
4121  *                 |         ARC           |
4122  *                 +-----------------------+
4123  *                    |         ^     ^
4124  *                    |         |     |
4125  *      l2arc_feed_thread()    arc_read()
4126  *                    |         |     |
4127  *                    |  l2arc read   |
4128  *                    V         |     |
4129  *               +---------------+    |
4130  *               |     L2ARC     |    |
4131  *               +---------------+    |
4132  *                   |    ^           |
4133  *          l2arc_write() |           |
4134  *                   |    |           |
4135  *                   V    |           |
4136  *                 +-------+      +-------+
4137  *                 | vdev  |      | vdev  |
4138  *                 | cache |      | cache |
4139  *                 +-------+      +-------+
4140  *                 +=========+     .-----.
4141  *                 :  L2ARC  :    |-_____-|
4142  *                 : devices :    | Disks |
4143  *                 +=========+    `-_____-'
4144  *
4145  * Read requests are satisfied from the following sources, in order:
4146  *
4147  *      1) ARC
4148  *      2) vdev cache of L2ARC devices
4149  *      3) L2ARC devices
4150  *      4) vdev cache of disks
4151  *      5) disks
4152  *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and traditional cache design:
4156  *
4157  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4158  * the ARC behave as usual, freeing buffers and placing headers on ghost
4159  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4160  * this would add inflated write latencies for all ARC memory pressure.
4161  *
4162  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4163  * It does this by periodically scanning buffers from the eviction-end of
4164  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4165  * not already there.  It scans until a headroom of buffers is satisfied,
4166  * which itself is a buffer for ARC eviction.  The thread that does this is
4167  * l2arc_feed_thread(), illustrated below; example sizes are included to
4168  * provide a better sense of ratio than this diagram:
4169  *
4170  *             head -->                        tail
4171  *              +---------------------+----------+
4172  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4173  *              +---------------------+----------+   |   o L2ARC eligible
4174  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4175  *              +---------------------+----------+   |
4176  *                   15.9 Gbytes      ^ 32 Mbytes    |
4177  *                                 headroom          |
4178  *                                            l2arc_feed_thread()
4179  *                                                   |
4180  *                       l2arc write hand <--[oooo]--'
4181  *                               |           8 Mbyte
4182  *                               |          write max
4183  *                               V
4184  *                +==============================+
4185  *      L2ARC dev |####|#|###|###|    |####| ... |
4186  *                +==============================+
4187  *                           32 Gbytes
4188  *
4189  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4190  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4191  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4192  * safe to say that this is an uncommon case, since buffers at the end of
4193  * the ARC lists have moved there due to inactivity.
4194  *
4195  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4196  * then the L2ARC simply misses copying some buffers.  This serves as a
4197  * pressure valve to prevent heavy read workloads from both stalling the ARC
4198  * with waits and clogging the L2ARC with writes.  This also helps prevent
4199  * the potential for the L2ARC to churn if it attempts to cache content too
4200  * quickly, such as during backups of the entire pool.
4201  *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
4207  *
4208  * The L2ARC device write speed is also boosted during this time so that
4209  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4210  * there are no L2ARC reads, and no fear of degrading read performance
4211  * through increased writes.
4212  *
4213  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4214  * the vdev queue can aggregate them into larger and fewer writes.  Each
4215  * device is written to in a rotor fashion, sweeping writes through
4216  * available space then repeating.
4217  *
4218  * 7. The L2ARC does not store dirty content.  It never needs to flush
4219  * write buffers back to disk based storage.
4220  *
4221  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4222  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4223  *
4224  * The performance of the L2ARC can be tweaked by a number of tunables, which
4225  * may be necessary for different workloads:
4226  *
4227  *      l2arc_write_max         max write bytes per interval
4228  *      l2arc_write_boost       extra write bytes during device warmup
4229  *      l2arc_noprefetch        skip caching prefetched buffers
4230  *      l2arc_headroom          number of max device writes to precache
4231  *      l2arc_feed_secs         seconds between L2ARC writing
4232  *
4233  * Tunables may be removed or added as future performance improvements are
4234  * integrated, and also may become zpool properties.
4235  *
4236  * There are three key functions that control how the L2ARC warms up:
4237  *
4238  *      l2arc_write_eligible()  check if a buffer is eligible to cache
4239  *      l2arc_write_size()      calculate how much to write
4240  *      l2arc_write_interval()  calculate sleep delay between writes
4241  *
4242  * These three functions determine what to write, how much, and how quickly
4243  * to send writes.
4244  */
4245
4246 static boolean_t
4247 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4248 {
4249         /*
4250          * A buffer is *not* eligible for the L2ARC if it:
4251          * 1. belongs to a different spa.
4252          * 2. is already cached on the L2ARC.
4253          * 3. has an I/O in progress (it may be an incomplete read).
4254          * 4. is flagged not eligible (zfs property).
4255          */
4256         if (ab->b_spa != spa_guid) {
4257                 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4258                 return (B_FALSE);
4259         }
4260         if (ab->b_l2hdr != NULL) {
4261                 ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4262                 return (B_FALSE);
4263         }
4264         if (HDR_IO_IN_PROGRESS(ab)) {
4265                 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4266                 return (B_FALSE);
4267         }
4268         if (!HDR_L2CACHE(ab)) {
4269                 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4270                 return (B_FALSE);
4271         }
4272
4273         return (B_TRUE);
4274 }
4275
4276 static uint64_t
4277 l2arc_write_size(l2arc_dev_t *dev)
4278 {
4279         uint64_t size;
4280
4281         size = dev->l2ad_write;
4282
4283         if (arc_warm == B_FALSE)
4284                 size += dev->l2ad_boost;
4285
4286         return (size);
4287
4288 }
4289
4290 static clock_t
4291 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4292 {
4293         clock_t interval, next, now;
4294
4295         /*
4296          * If the ARC lists are busy, increase our write rate; if the
4297          * lists are stale, idle back.  This is achieved by checking
4298          * how much we previously wrote - if it was more than half of
4299          * what we wanted, schedule the next write much sooner.
4300          */
4301         if (l2arc_feed_again && wrote > (wanted / 2))
4302                 interval = (hz * l2arc_feed_min_ms) / 1000;
4303         else
4304                 interval = hz * l2arc_feed_secs;
4305
4306         now = ddi_get_lbolt();
4307         next = MAX(now, MIN(now + interval, began + interval));
4308
4309         return (next);
4310 }
4311
4312 static void
4313 l2arc_hdr_stat_add(void)
4314 {
4315         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4316         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4317 }
4318
4319 static void
4320 l2arc_hdr_stat_remove(void)
4321 {
4322         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4323         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4324 }
4325
4326 /*
4327  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4328  * If a device is returned, this also returns holding the spa config lock.
4329  */
4330 static l2arc_dev_t *
4331 l2arc_dev_get_next(void)
4332 {
4333         l2arc_dev_t *first, *next = NULL;
4334
4335         /*
4336          * Lock out the removal of spas (spa_namespace_lock), then removal
4337          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4338          * both locks will be dropped and a spa config lock held instead.
4339          */
4340         mutex_enter(&spa_namespace_lock);
4341         mutex_enter(&l2arc_dev_mtx);
4342
4343         /* if there are no vdevs, there is nothing to do */
4344         if (l2arc_ndev == 0)
4345                 goto out;
4346
4347         first = NULL;
4348         next = l2arc_dev_last;
4349         do {
4350                 /* loop around the list looking for a non-faulted vdev */
4351                 if (next == NULL) {
4352                         next = list_head(l2arc_dev_list);
4353                 } else {
4354                         next = list_next(l2arc_dev_list, next);
4355                         if (next == NULL)
4356                                 next = list_head(l2arc_dev_list);
4357                 }
4358
4359                 /* if we have come back to the start, bail out */
4360                 if (first == NULL)
4361                         first = next;
4362                 else if (next == first)
4363                         break;
4364
4365         } while (vdev_is_dead(next->l2ad_vdev));
4366
4367         /* if we were unable to find any usable vdevs, return NULL */
4368         if (vdev_is_dead(next->l2ad_vdev))
4369                 next = NULL;
4370
4371         l2arc_dev_last = next;
4372
4373 out:
4374         mutex_exit(&l2arc_dev_mtx);
4375
4376         /*
4377          * Grab the config lock to prevent the 'next' device from being
4378          * removed while we are writing to it.
4379          */
4380         if (next != NULL)
4381                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4382         mutex_exit(&spa_namespace_lock);
4383
4384         return (next);
4385 }
4386
4387 /*
4388  * Free buffers that were tagged for destruction.
4389  */
4390 static void
4391 l2arc_do_free_on_write()
4392 {
4393         list_t *buflist;
4394         l2arc_data_free_t *df, *df_prev;
4395
4396         mutex_enter(&l2arc_free_on_write_mtx);
4397         buflist = l2arc_free_on_write;
4398
4399         for (df = list_tail(buflist); df; df = df_prev) {
4400                 df_prev = list_prev(buflist, df);
4401                 ASSERT(df->l2df_data != NULL);
4402                 ASSERT(df->l2df_func != NULL);
4403                 df->l2df_func(df->l2df_data, df->l2df_size);
4404                 list_remove(buflist, df);
4405                 kmem_free(df, sizeof (l2arc_data_free_t));
4406         }
4407
4408         mutex_exit(&l2arc_free_on_write_mtx);
4409 }
4410
4411 /*
4412  * A write to a cache device has completed.  Update all headers to allow
4413  * reads from these buffers to begin.
4414  */
4415 static void
4416 l2arc_write_done(zio_t *zio)
4417 {
4418         l2arc_write_callback_t *cb;
4419         l2arc_dev_t *dev;
4420         list_t *buflist;
4421         arc_buf_hdr_t *head, *ab, *ab_prev;
4422         l2arc_buf_hdr_t *abl2;
4423         kmutex_t *hash_lock;
4424
4425         cb = zio->io_private;
4426         ASSERT(cb != NULL);
4427         dev = cb->l2wcb_dev;
4428         ASSERT(dev != NULL);
4429         head = cb->l2wcb_head;
4430         ASSERT(head != NULL);
4431         buflist = dev->l2ad_buflist;
4432         ASSERT(buflist != NULL);
4433         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4434             l2arc_write_callback_t *, cb);
4435
4436         if (zio->io_error != 0)
4437                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4438
4439         mutex_enter(&l2arc_buflist_mtx);
4440
4441         /*
4442          * All writes completed, or an error was hit.
4443          */
4444         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4445                 ab_prev = list_prev(buflist, ab);
4446
4447                 hash_lock = HDR_LOCK(ab);
4448                 if (!mutex_tryenter(hash_lock)) {
4449                         /*
4450                          * This buffer misses out.  It may be in a stage
4451                          * of eviction.  Its ARC_L2_WRITING flag will be
4452                          * left set, denying reads to this buffer.
4453                          */
4454                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4455                         continue;
4456                 }
4457
4458                 if (zio->io_error != 0) {
4459                         /*
4460                          * Error - drop L2ARC entry.
4461                          */
4462                         list_remove(buflist, ab);
4463                         abl2 = ab->b_l2hdr;
4464                         ab->b_l2hdr = NULL;
4465                         trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4466                             ab->b_size, 0);
4467                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4468                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4469                 }
4470
4471                 /*
4472                  * Allow ARC to begin reads to this L2ARC entry.
4473                  */
4474                 ab->b_flags &= ~ARC_L2_WRITING;
4475
4476                 mutex_exit(hash_lock);
4477         }
4478
4479         atomic_inc_64(&l2arc_writes_done);
4480         list_remove(buflist, head);
4481         kmem_cache_free(hdr_cache, head);
4482         mutex_exit(&l2arc_buflist_mtx);
4483
4484         l2arc_do_free_on_write();
4485
4486         kmem_free(cb, sizeof (l2arc_write_callback_t));
4487 }
4488
4489 /*
4490  * A read to a cache device completed.  Validate buffer contents before
4491  * handing over to the regular ARC routines.
4492  */
4493 static void
4494 l2arc_read_done(zio_t *zio)
4495 {
4496         l2arc_read_callback_t *cb;
4497         arc_buf_hdr_t *hdr;
4498         arc_buf_t *buf;
4499         kmutex_t *hash_lock;
4500         int equal;
4501
4502         ASSERT(zio->io_vd != NULL);
4503         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4504
4505         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4506
4507         cb = zio->io_private;
4508         ASSERT(cb != NULL);
4509         buf = cb->l2rcb_buf;
4510         ASSERT(buf != NULL);
4511
4512         hash_lock = HDR_LOCK(buf->b_hdr);
4513         mutex_enter(hash_lock);
4514         hdr = buf->b_hdr;
4515         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4516
4517         /*
4518          * Check this survived the L2ARC journey.
4519          */
4520         equal = arc_cksum_equal(buf);
4521         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4522                 mutex_exit(hash_lock);
4523                 zio->io_private = buf;
4524                 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
4525                 zio->io_bp = &zio->io_bp_copy;  /* XXX fix in L2ARC 2.0 */
4526                 arc_read_done(zio);
4527         } else {
4528                 mutex_exit(hash_lock);
4529                 /*
4530                  * Buffer didn't survive caching.  Increment stats and
4531                  * reissue to the original storage device.
4532                  */
4533                 if (zio->io_error != 0) {
4534                         ARCSTAT_BUMP(arcstat_l2_io_error);
4535                 } else {
4536                         zio->io_error = SET_ERROR(EIO);
4537                 }
4538                 if (!equal)
4539                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4540
4541                 /*
4542                  * If there's no waiter, issue an async i/o to the primary
4543                  * storage now.  If there *is* a waiter, the caller must
4544                  * issue the i/o in a context where it's OK to block.
4545                  */
4546                 if (zio->io_waiter == NULL) {
4547                         zio_t *pio = zio_unique_parent(zio);
4548
4549                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4550
4551                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4552                             buf->b_data, zio->io_size, arc_read_done, buf,
4553                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4554                 }
4555         }
4556
4557         kmem_free(cb, sizeof (l2arc_read_callback_t));
4558 }
4559
4560 /*
4561  * This is the list priority from which the L2ARC will search for pages to
4562  * cache.  This is used within loops (0..3) to cycle through lists in the
4563  * desired order.  This order can have a significant effect on cache
4564  * performance.
4565  *
4566  * Currently the metadata lists are hit first, MFU then MRU, followed by
4567  * the data lists.  This function returns a locked list, and also returns
4568  * the lock pointer.
4569  */
4570 static list_t *
4571 l2arc_list_locked(int list_num, kmutex_t **lock)
4572 {
4573         list_t *list = NULL;
4574         int idx;
4575
4576         ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4577
4578         if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4579                 idx = list_num;
4580                 list = &arc_mfu->arcs_lists[idx];
4581                 *lock = ARCS_LOCK(arc_mfu, idx);
4582         } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4583                 idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4584                 list = &arc_mru->arcs_lists[idx];
4585                 *lock = ARCS_LOCK(arc_mru, idx);
4586         } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4587                 ARC_BUFC_NUMDATALISTS)) {
4588                 idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4589                 list = &arc_mfu->arcs_lists[idx];
4590                 *lock = ARCS_LOCK(arc_mfu, idx);
4591         } else {
4592                 idx = list_num - ARC_BUFC_NUMLISTS;
4593                 list = &arc_mru->arcs_lists[idx];
4594                 *lock = ARCS_LOCK(arc_mru, idx);
4595         }
4596
4597         ASSERT(!(MUTEX_HELD(*lock)));
4598         mutex_enter(*lock);
4599         return (list);
4600 }
4601
4602 /*
4603  * Evict buffers from the device write hand to the distance specified in
4604  * bytes.  This distance may span populated buffers, it may span nothing.
4605  * This is clearing a region on the L2ARC device ready for writing.
4606  * If the 'all' boolean is set, every buffer is evicted.
4607  */
4608 static void
4609 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4610 {
4611         list_t *buflist;
4612         l2arc_buf_hdr_t *abl2;
4613         arc_buf_hdr_t *ab, *ab_prev;
4614         kmutex_t *hash_lock;
4615         uint64_t taddr;
4616
4617         buflist = dev->l2ad_buflist;
4618
4619         if (buflist == NULL)
4620                 return;
4621
4622         if (!all && dev->l2ad_first) {
4623                 /*
4624                  * This is the first sweep through the device.  There is
4625                  * nothing to evict.
4626                  */
4627                 return;
4628         }
4629
4630         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4631                 /*
4632                  * When nearing the end of the device, evict to the end
4633                  * before the device write hand jumps to the start.
4634                  */
4635                 taddr = dev->l2ad_end;
4636         } else {
4637                 taddr = dev->l2ad_hand + distance;
4638         }
4639         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4640             uint64_t, taddr, boolean_t, all);
4641
4642 top:
4643         mutex_enter(&l2arc_buflist_mtx);
4644         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4645                 ab_prev = list_prev(buflist, ab);
4646
4647                 hash_lock = HDR_LOCK(ab);
4648                 if (!mutex_tryenter(hash_lock)) {
4649                         /*
4650                          * Missed the hash lock.  Retry.
4651                          */
4652                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4653                         mutex_exit(&l2arc_buflist_mtx);
4654                         mutex_enter(hash_lock);
4655                         mutex_exit(hash_lock);
4656                         goto top;
4657                 }
4658
4659                 if (HDR_L2_WRITE_HEAD(ab)) {
4660                         /*
4661                          * We hit a write head node.  Leave it for
4662                          * l2arc_write_done().
4663                          */
4664                         list_remove(buflist, ab);
4665                         mutex_exit(hash_lock);
4666                         continue;
4667                 }
4668
4669                 if (!all && ab->b_l2hdr != NULL &&
4670                     (ab->b_l2hdr->b_daddr > taddr ||
4671                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4672                         /*
4673                          * We've evicted to the target address,
4674                          * or the end of the device.
4675                          */
4676                         mutex_exit(hash_lock);
4677                         break;
4678                 }
4679
4680                 if (HDR_FREE_IN_PROGRESS(ab)) {
4681                         /*
4682                          * Already on the path to destruction.
4683                          */
4684                         mutex_exit(hash_lock);
4685                         continue;
4686                 }
4687
4688                 if (ab->b_state == arc_l2c_only) {
4689                         ASSERT(!HDR_L2_READING(ab));
4690                         /*
4691                          * This doesn't exist in the ARC.  Destroy.
4692                          * arc_hdr_destroy() will call list_remove()
4693                          * and decrement arcstat_l2_size.
4694                          */
4695                         arc_change_state(arc_anon, ab, hash_lock);
4696                         arc_hdr_destroy(ab);
4697                 } else {
4698                         /*
4699                          * Invalidate issued or about to be issued
4700                          * reads, since we may be about to write
4701                          * over this location.
4702                          */
4703                         if (HDR_L2_READING(ab)) {
4704                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4705                                 ab->b_flags |= ARC_L2_EVICTED;
4706                         }
4707
4708                         /*
4709                          * Tell ARC this no longer exists in L2ARC.
4710                          */
4711                         if (ab->b_l2hdr != NULL) {
4712                                 abl2 = ab->b_l2hdr;
4713                                 ab->b_l2hdr = NULL;
4714                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4715                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4716                         }
4717                         list_remove(buflist, ab);
4718
4719                         /*
4720                          * This may have been leftover after a
4721                          * failed write.
4722                          */
4723                         ab->b_flags &= ~ARC_L2_WRITING;
4724                 }
4725                 mutex_exit(hash_lock);
4726         }
4727         mutex_exit(&l2arc_buflist_mtx);
4728
4729         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4730         dev->l2ad_evict = taddr;
4731 }
4732
4733 /*
4734  * Find and write ARC buffers to the L2ARC device.
4735  *
4736  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4737  * for reading until they have completed writing.
4738  */
4739 static uint64_t
4740 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4741 {
4742         arc_buf_hdr_t *ab, *ab_prev, *head;
4743         l2arc_buf_hdr_t *hdrl2;
4744         list_t *list;
4745         uint64_t passed_sz, write_sz, buf_sz, headroom;
4746         void *buf_data;
4747         kmutex_t *hash_lock, *list_lock;
4748         boolean_t have_lock, full;
4749         l2arc_write_callback_t *cb;
4750         zio_t *pio, *wzio;
4751         uint64_t guid = spa_load_guid(spa);
4752         int try;
4753
4754         ASSERT(dev->l2ad_vdev != NULL);
4755
4756         pio = NULL;
4757         write_sz = 0;
4758         full = B_FALSE;
4759         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4760         head->b_flags |= ARC_L2_WRITE_HEAD;
4761
4762         ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4763         /*
4764          * Copy buffers for L2ARC writing.
4765          */
4766         mutex_enter(&l2arc_buflist_mtx);
4767         for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4768                 list = l2arc_list_locked(try, &list_lock);
4769                 passed_sz = 0;
4770                 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4771
4772                 /*
4773                  * L2ARC fast warmup.
4774                  *
4775                  * Until the ARC is warm and starts to evict, read from the
4776                  * head of the ARC lists rather than the tail.
4777                  */
4778                 headroom = target_sz * l2arc_headroom;
4779                 if (arc_warm == B_FALSE)
4780                         ab = list_head(list);
4781                 else
4782                         ab = list_tail(list);
4783                 if (ab == NULL)
4784                         ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4785
4786                 for (; ab; ab = ab_prev) {
4787                         if (arc_warm == B_FALSE)
4788                                 ab_prev = list_next(list, ab);
4789                         else
4790                                 ab_prev = list_prev(list, ab);
4791                         ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4792
4793                         hash_lock = HDR_LOCK(ab);
4794                         have_lock = MUTEX_HELD(hash_lock);
4795                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4796                                 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4797                                 /*
4798                                  * Skip this buffer rather than waiting.
4799                                  */
4800                                 continue;
4801                         }
4802
4803                         passed_sz += ab->b_size;
4804                         if (passed_sz > headroom) {
4805                                 /*
4806                                  * Searched too far.
4807                                  */
4808                                 mutex_exit(hash_lock);
4809                                 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4810                                 break;
4811                         }
4812
4813                         if (!l2arc_write_eligible(guid, ab)) {
4814                                 mutex_exit(hash_lock);
4815                                 continue;
4816                         }
4817
4818                         if ((write_sz + ab->b_size) > target_sz) {
4819                                 full = B_TRUE;
4820                                 mutex_exit(hash_lock);
4821                                 ARCSTAT_BUMP(arcstat_l2_write_full);
4822                                 break;
4823                         }
4824
4825                         if (pio == NULL) {
4826                                 /*
4827                                  * Insert a dummy header on the buflist so
4828                                  * l2arc_write_done() can find where the
4829                                  * write buffers begin without searching.
4830                                  */
4831                                 list_insert_head(dev->l2ad_buflist, head);
4832
4833                                 cb = kmem_alloc(
4834                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4835                                 cb->l2wcb_dev = dev;
4836                                 cb->l2wcb_head = head;
4837                                 pio = zio_root(spa, l2arc_write_done, cb,
4838                                     ZIO_FLAG_CANFAIL);
4839                                 ARCSTAT_BUMP(arcstat_l2_write_pios);
4840                         }
4841
4842                         /*
4843                          * Create and add a new L2ARC header.
4844                          */
4845                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4846                         hdrl2->b_dev = dev;
4847                         hdrl2->b_daddr = dev->l2ad_hand;
4848
4849                         ab->b_flags |= ARC_L2_WRITING;
4850                         ab->b_l2hdr = hdrl2;
4851                         list_insert_head(dev->l2ad_buflist, ab);
4852                         buf_data = ab->b_buf->b_data;
4853                         buf_sz = ab->b_size;
4854
4855                         /*
4856                          * Compute and store the buffer cksum before
4857                          * writing.  On debug the cksum is verified first.
4858                          */
4859                         arc_cksum_verify(ab->b_buf);
4860                         arc_cksum_compute(ab->b_buf, B_TRUE);
4861
4862                         mutex_exit(hash_lock);
4863
4864                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4865                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4866                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4867                             ZIO_FLAG_CANFAIL, B_FALSE);
4868
4869                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4870                             zio_t *, wzio);
4871                         (void) zio_nowait(wzio);
4872
4873                         /*
4874                          * Keep the clock hand suitably device-aligned.
4875                          */
4876                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4877
4878                         write_sz += buf_sz;
4879                         dev->l2ad_hand += buf_sz;
4880                 }
4881
4882                 mutex_exit(list_lock);
4883
4884                 if (full == B_TRUE)
4885                         break;
4886         }
4887         mutex_exit(&l2arc_buflist_mtx);
4888
4889         if (pio == NULL) {
4890                 ASSERT0(write_sz);
4891                 kmem_cache_free(hdr_cache, head);
4892                 return (0);
4893         }
4894
4895         ASSERT3U(write_sz, <=, target_sz);
4896         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4897         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4898         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4899         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4900
4901         /*
4902          * Bump device hand to the device start if it is approaching the end.
4903          * l2arc_evict() will already have evicted ahead for this case.
4904          */
4905         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4906                 vdev_space_update(dev->l2ad_vdev,
4907                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4908                 dev->l2ad_hand = dev->l2ad_start;
4909                 dev->l2ad_evict = dev->l2ad_start;
4910                 dev->l2ad_first = B_FALSE;
4911         }
4912
4913         dev->l2ad_writing = B_TRUE;
4914         (void) zio_wait(pio);
4915         dev->l2ad_writing = B_FALSE;
4916
4917         return (write_sz);
4918 }
4919
4920 /*
4921  * This thread feeds the L2ARC at regular intervals.  This is the beating
4922  * heart of the L2ARC.
4923  */
4924 static void
4925 l2arc_feed_thread(void *dummy __unused)
4926 {
4927         callb_cpr_t cpr;
4928         l2arc_dev_t *dev;
4929         spa_t *spa;
4930         uint64_t size, wrote;
4931         clock_t begin, next = ddi_get_lbolt();
4932
4933         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4934
4935         mutex_enter(&l2arc_feed_thr_lock);
4936
4937         while (l2arc_thread_exit == 0) {
4938                 CALLB_CPR_SAFE_BEGIN(&cpr);
4939                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4940                     next - ddi_get_lbolt());
4941                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4942                 next = ddi_get_lbolt() + hz;
4943
4944                 /*
4945                  * Quick check for L2ARC devices.
4946                  */
4947                 mutex_enter(&l2arc_dev_mtx);
4948                 if (l2arc_ndev == 0) {
4949                         mutex_exit(&l2arc_dev_mtx);
4950                         continue;
4951                 }
4952                 mutex_exit(&l2arc_dev_mtx);
4953                 begin = ddi_get_lbolt();
4954
4955                 /*
4956                  * This selects the next l2arc device to write to, and in
4957                  * doing so the next spa to feed from: dev->l2ad_spa.   This
4958                  * will return NULL if there are now no l2arc devices or if
4959                  * they are all faulted.
4960                  *
4961                  * If a device is returned, its spa's config lock is also
4962                  * held to prevent device removal.  l2arc_dev_get_next()
4963                  * will grab and release l2arc_dev_mtx.
4964                  */
4965                 if ((dev = l2arc_dev_get_next()) == NULL)
4966                         continue;
4967
4968                 spa = dev->l2ad_spa;
4969                 ASSERT(spa != NULL);
4970
4971                 /*
4972                  * If the pool is read-only then force the feed thread to
4973                  * sleep a little longer.
4974                  */
4975                 if (!spa_writeable(spa)) {
4976                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4977                         spa_config_exit(spa, SCL_L2ARC, dev);
4978                         continue;
4979                 }
4980
4981                 /*
4982                  * Avoid contributing to memory pressure.
4983                  */
4984                 if (arc_reclaim_needed()) {
4985                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4986                         spa_config_exit(spa, SCL_L2ARC, dev);
4987                         continue;
4988                 }
4989
4990                 ARCSTAT_BUMP(arcstat_l2_feeds);
4991
4992                 size = l2arc_write_size(dev);
4993
4994                 /*
4995                  * Evict L2ARC buffers that will be overwritten.
4996                  */
4997                 l2arc_evict(dev, size, B_FALSE);
4998
4999                 /*
5000                  * Write ARC buffers.
5001                  */
5002                 wrote = l2arc_write_buffers(spa, dev, size);
5003
5004                 /*
5005                  * Calculate interval between writes.
5006                  */
5007                 next = l2arc_write_interval(begin, size, wrote);
5008                 spa_config_exit(spa, SCL_L2ARC, dev);
5009         }
5010
5011         l2arc_thread_exit = 0;
5012         cv_broadcast(&l2arc_feed_thr_cv);
5013         CALLB_CPR_EXIT(&cpr);           /* drops l2arc_feed_thr_lock */
5014         thread_exit();
5015 }
5016
5017 boolean_t
5018 l2arc_vdev_present(vdev_t *vd)
5019 {
5020         l2arc_dev_t *dev;
5021
5022         mutex_enter(&l2arc_dev_mtx);
5023         for (dev = list_head(l2arc_dev_list); dev != NULL;
5024             dev = list_next(l2arc_dev_list, dev)) {
5025                 if (dev->l2ad_vdev == vd)
5026                         break;
5027         }
5028         mutex_exit(&l2arc_dev_mtx);
5029
5030         return (dev != NULL);
5031 }
5032
5033 /*
5034  * Add a vdev for use by the L2ARC.  By this point the spa has already
5035  * validated the vdev and opened it.
5036  */
5037 void
5038 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5039 {
5040         l2arc_dev_t *adddev;
5041
5042         ASSERT(!l2arc_vdev_present(vd));
5043
5044         /*
5045          * Create a new l2arc device entry.
5046          */
5047         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5048         adddev->l2ad_spa = spa;
5049         adddev->l2ad_vdev = vd;
5050         adddev->l2ad_write = l2arc_write_max;
5051         adddev->l2ad_boost = l2arc_write_boost;
5052         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5053         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5054         adddev->l2ad_hand = adddev->l2ad_start;
5055         adddev->l2ad_evict = adddev->l2ad_start;
5056         adddev->l2ad_first = B_TRUE;
5057         adddev->l2ad_writing = B_FALSE;
5058         ASSERT3U(adddev->l2ad_write, >, 0);
5059
5060         /*
5061          * This is a list of all ARC buffers that are still valid on the
5062          * device.
5063          */
5064         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5065         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5066             offsetof(arc_buf_hdr_t, b_l2node));
5067
5068         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5069
5070         /*
5071          * Add device to global list
5072          */
5073         mutex_enter(&l2arc_dev_mtx);
5074         list_insert_head(l2arc_dev_list, adddev);
5075         atomic_inc_64(&l2arc_ndev);
5076         mutex_exit(&l2arc_dev_mtx);
5077 }
5078
5079 /*
5080  * Remove a vdev from the L2ARC.
5081  */
5082 void
5083 l2arc_remove_vdev(vdev_t *vd)
5084 {
5085         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5086
5087         /*
5088          * Find the device by vdev
5089          */
5090         mutex_enter(&l2arc_dev_mtx);
5091         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5092                 nextdev = list_next(l2arc_dev_list, dev);
5093                 if (vd == dev->l2ad_vdev) {
5094                         remdev = dev;
5095                         break;
5096                 }
5097         }
5098         ASSERT(remdev != NULL);
5099
5100         /*
5101          * Remove device from global list
5102          */
5103         list_remove(l2arc_dev_list, remdev);
5104         l2arc_dev_last = NULL;          /* may have been invalidated */
5105         atomic_dec_64(&l2arc_ndev);
5106         mutex_exit(&l2arc_dev_mtx);
5107
5108         /*
5109          * Clear all buflists and ARC references.  L2ARC device flush.
5110          */
5111         l2arc_evict(remdev, 0, B_TRUE);
5112         list_destroy(remdev->l2ad_buflist);
5113         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5114         kmem_free(remdev, sizeof (l2arc_dev_t));
5115 }
5116
5117 void
5118 l2arc_init(void)
5119 {
5120         l2arc_thread_exit = 0;
5121         l2arc_ndev = 0;
5122         l2arc_writes_sent = 0;
5123         l2arc_writes_done = 0;
5124
5125         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5126         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5127         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5128         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5129         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5130
5131         l2arc_dev_list = &L2ARC_dev_list;
5132         l2arc_free_on_write = &L2ARC_free_on_write;
5133         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5134             offsetof(l2arc_dev_t, l2ad_node));
5135         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5136             offsetof(l2arc_data_free_t, l2df_list_node));
5137 }
5138
5139 void
5140 l2arc_fini(void)
5141 {
5142         /*
5143          * This is called from dmu_fini(), which is called from spa_fini();
5144          * Because of this, we can assume that all l2arc devices have
5145          * already been removed when the pools themselves were removed.
5146          */
5147
5148         l2arc_do_free_on_write();
5149
5150         mutex_destroy(&l2arc_feed_thr_lock);
5151         cv_destroy(&l2arc_feed_thr_cv);
5152         mutex_destroy(&l2arc_dev_mtx);
5153         mutex_destroy(&l2arc_buflist_mtx);
5154         mutex_destroy(&l2arc_free_on_write_mtx);
5155
5156         list_destroy(l2arc_dev_list);
5157         list_destroy(l2arc_free_on_write);
5158 }
5159
5160 void
5161 l2arc_start(void)
5162 {
5163         if (!(spa_mode_global & FWRITE))
5164                 return;
5165
5166         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5167             TS_RUN, minclsyspri);
5168 }
5169
5170 void
5171 l2arc_stop(void)
5172 {
5173         if (!(spa_mode_global & FWRITE))
5174                 return;
5175
5176         mutex_enter(&l2arc_feed_thr_lock);
5177         cv_signal(&l2arc_feed_thr_cv);  /* kick thread out of startup */
5178         l2arc_thread_exit = 1;
5179         while (l2arc_thread_exit != 0)
5180                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5181         mutex_exit(&l2arc_feed_thr_lock);
5182 }