4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * DVA-based Adjustable Replacement Cache
29 * While much of the theory of operation used here is
30 * based on the self-tuning, low overhead replacement cache
31 * presented by Megiddo and Modha at FAST 2003, there are some
32 * significant differences:
34 * 1. The Megiddo and Modha model assumes any page is evictable.
35 * Pages in its cache cannot be "locked" into memory. This makes
36 * the eviction algorithm simple: evict the last page in the list.
37 * This also makes the performance characteristics easy to reason
38 * about. Our cache is not so simple. At any given moment, some
39 * subset of the blocks in the cache are un-evictable because we
40 * have handed out a reference to them. Blocks are only evictable
41 * when there are no external references active. This makes
42 * eviction far more problematic: we choose to evict the evictable
43 * blocks that are the "lowest" in the list.
45 * There are times when it is not possible to evict the requested
46 * space. In these circumstances we are unable to adjust the cache
47 * size. To prevent the cache from growing unbounded at these times, we
48 * implement a "cache throttle" that slows the flow of new data
49 * into the cache until we can make space available.
51 * 2. The Megiddo and Modha model assumes a fixed cache size.
52 * Pages are evicted when the cache is full and there is a cache
53 * miss. Our model has a variable sized cache. It grows with
54 * high use, but also tries to react to memory pressure from the
55 * operating system: decreasing its size when system memory is
58 * 3. The Megiddo and Modha model assumes a fixed page size. All
59 * elements of the cache are therefore exactly the same size. So
60 * when adjusting the cache size following a cache miss, it's simply
61 * a matter of choosing a single page to evict. In our model, we
62 * have variable sized cache blocks (ranging from 512 bytes to
63 * 128K bytes). We therefore choose a set of blocks to evict to make
64 * space for a cache miss that approximates as closely as possible
65 * the space used by the new block.
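 *
 * As an illustrative sketch only (the real logic, with its locking and
 * reference-count checks, lives in arc_evict() below; evictable() and
 * evict() are stand-in names here), satisfying a miss of `bytes' amounts
 * to walking the evictable list from the tail and accumulating freed
 * space until the request is covered:
 *
 *	evicted = 0;
 *	for (ab = list_tail(list); ab != NULL && evicted < bytes;
 *	    ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		if (!evictable(ab))
 *			continue;
 *		evicted += ab->b_size;
 *		evict(ab);
 *	}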
67 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
68 * by N. Megiddo & D. Modha, FAST 2003
74 * A new reference to a cache buffer can be obtained in two
75 * ways: 1) via a hash table lookup using the DVA as a key,
76 * or 2) via one of the ARC lists. The arc_read() interface
77 * uses method 1, while the internal arc algorithms for
78 * adjusting the cache use method 2. We therefore provide two
79 * types of locks: 1) the hash table lock array, and 2) the
82 * Buffers do not have their own mutexes; rather, they rely on the
83 * hash table mutexes for the bulk of their protection (i.e. most
84 * fields in the arc_buf_hdr_t are protected by these mutexes).
86 * buf_hash_find() returns the appropriate mutex (held) when it
87 * locates the requested buffer in the hash table. It returns
88 * NULL for the mutex if the buffer was not in the table.
90 * buf_hash_remove() expects the appropriate hash mutex to be
91 * already held before it is invoked.
93 * Each arc state also has a mutex which is used to protect the
94 * buffer list associated with the state. When attempting to
95 * obtain a hash table lock while holding an arc list lock, you
96 * must use mutex_tryenter() to avoid deadlock. Also note that
97 * the active state mutex must be held before the ghost state mutex.
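 *
 * A minimal sketch of that rule (the loop body is a placeholder; the
 * real code is in arc_evict() and arc_evict_ghost() below): with an arc
 * state list mutex held, the hash table lock is only ever tried, and a
 * contended buffer is skipped rather than waited on:
 *
 *	mutex_enter(&state->arcs_mtx);
 *	for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		hash_lock = HDR_LOCK(ab);
 *		if (!mutex_tryenter(hash_lock))
 *			continue;
 *		... evict or move ab under hash_lock ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->arcs_mtx);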
99 * Arc buffers may have an associated eviction callback function.
100 * This function will be invoked prior to removing the buffer (e.g.
101 * in arc_do_user_evicts()). Note however that the data associated
102 * with the buffer may be evicted prior to the callback. The callback
103 * must be made with *no locks held* (to prevent deadlock). Additionally,
104 * the users of callbacks must ensure that their private data is
105 * protected from simultaneous callbacks from arc_buf_evict()
106 * and arc_do_user_evicts().
108 * Note that the majority of the performance stats are manipulated
109 * with atomic operations.
111 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
113 * - L2ARC buflist creation
114 * - L2ARC buflist eviction
115 * - L2ARC write completion, which walks L2ARC buflists
116 * - ARC header destruction, as it removes from L2ARC buflists
117 * - ARC header release, as it removes from L2ARC buflists
122 #include <sys/zio_checksum.h>
123 #include <sys/zfs_context.h>
125 #include <sys/refcount.h>
126 #include <sys/vdev.h>
128 #include <sys/dnlc.h>
130 #include <sys/callb.h>
131 #include <sys/kstat.h>
134 #include <vm/vm_pageout.h>
136 static kmutex_t arc_reclaim_thr_lock;
137 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
138 static uint8_t arc_thread_exit;
140 extern int zfs_write_limit_shift;
141 extern uint64_t zfs_write_limit_max;
142 extern kmutex_t zfs_write_limit_lock;
144 #define ARC_REDUCE_DNLC_PERCENT 3
145 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
147 typedef enum arc_reclaim_strategy {
148 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
149 ARC_RECLAIM_CONS /* Conservative reclaim strategy */
150 } arc_reclaim_strategy_t;
152 /* number of seconds before growing cache again */
153 static int arc_grow_retry = 60;
156 * minimum lifespan of a prefetch block in clock ticks
157 * (initialized in arc_init())
159 static int arc_min_prefetch_lifespan;
161 extern int zfs_prefetch_disable;
165 * The arc has filled available memory and has now warmed up.
167 static boolean_t arc_warm;
170 * These tunables are for performance analysis.
172 uint64_t zfs_arc_max;
173 uint64_t zfs_arc_min;
174 uint64_t zfs_arc_meta_limit = 0;
175 int zfs_mdcomp_disable = 0;
177 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
178 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
179 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
180 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
181 SYSCTL_DECL(_vfs_zfs);
182 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
184 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
186 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
187 &zfs_mdcomp_disable, 0, "Disable metadata compression");
190 * Note that buffers can be in one of 6 states:
191 * ARC_anon - anonymous (discussed below)
192 * ARC_mru - recently used, currently cached
193 * ARC_mru_ghost - recently used, no longer in cache
194 * ARC_mfu - frequently used, currently cached
195 * ARC_mfu_ghost - frequently used, no longer in cache
196 * ARC_l2c_only - exists in L2ARC but not other states
197 * When there are no active references to a buffer, it is linked
198 * onto a list in one of these arc states. These are
199 * the only buffers that can be evicted or deleted. Within each
200 * state there are multiple lists, one for meta-data and one for
201 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
202 * etc.) is tracked separately so that it can be managed more
203 * explicitly: favored over data, limited explicitly.
205 * Anonymous buffers are buffers that are not associated with
206 * a DVA. These are buffers that hold dirty block copies
207 * before they are written to stable storage. By definition,
208 * they are "ref'd" and are considered part of arc_mru
209 * that cannot be freed. Generally, they will acquire a DVA
210 * as they are written and migrate onto the arc_mru list.
212 * The ARC_l2c_only state is for buffers that are in the second
213 * level ARC but no longer in any of the ARC_m* lists. The second
214 * level ARC itself may also contain buffers that are in any of
215 * the ARC_m* states - meaning that a buffer can exist in two
216 * places. The reason for the ARC_l2c_only state is to keep the
217 * buffer header in the hash table, so that reads that hit the
218 * second level ARC benefit from these fast lookups.
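 *
 * A rough sketch of the common transitions implied above (the exact
 * rules, including prefetch handling, are in arc_access() and
 * arc_evict()/arc_evict_ghost() below):
 *
 *	anon --(written, DVA assigned)--> mru
 *	mru --(accessed again)----------> mfu
 *	mru --(evicted)-----------------> mru_ghost --(hit)--> mfu
 *	mfu --(evicted)-----------------> mfu_ghost --(hit)--> mfu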
221 typedef struct arc_state {
222 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
223 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
224 uint64_t arcs_size; /* total amount of data in this state */
229 static arc_state_t ARC_anon;
230 static arc_state_t ARC_mru;
231 static arc_state_t ARC_mru_ghost;
232 static arc_state_t ARC_mfu;
233 static arc_state_t ARC_mfu_ghost;
234 static arc_state_t ARC_l2c_only;
236 typedef struct arc_stats {
237 kstat_named_t arcstat_hits;
238 kstat_named_t arcstat_misses;
239 kstat_named_t arcstat_demand_data_hits;
240 kstat_named_t arcstat_demand_data_misses;
241 kstat_named_t arcstat_demand_metadata_hits;
242 kstat_named_t arcstat_demand_metadata_misses;
243 kstat_named_t arcstat_prefetch_data_hits;
244 kstat_named_t arcstat_prefetch_data_misses;
245 kstat_named_t arcstat_prefetch_metadata_hits;
246 kstat_named_t arcstat_prefetch_metadata_misses;
247 kstat_named_t arcstat_mru_hits;
248 kstat_named_t arcstat_mru_ghost_hits;
249 kstat_named_t arcstat_mfu_hits;
250 kstat_named_t arcstat_mfu_ghost_hits;
251 kstat_named_t arcstat_deleted;
252 kstat_named_t arcstat_recycle_miss;
253 kstat_named_t arcstat_mutex_miss;
254 kstat_named_t arcstat_evict_skip;
255 kstat_named_t arcstat_hash_elements;
256 kstat_named_t arcstat_hash_elements_max;
257 kstat_named_t arcstat_hash_collisions;
258 kstat_named_t arcstat_hash_chains;
259 kstat_named_t arcstat_hash_chain_max;
260 kstat_named_t arcstat_p;
261 kstat_named_t arcstat_c;
262 kstat_named_t arcstat_c_min;
263 kstat_named_t arcstat_c_max;
264 kstat_named_t arcstat_size;
265 kstat_named_t arcstat_hdr_size;
266 kstat_named_t arcstat_l2_hits;
267 kstat_named_t arcstat_l2_misses;
268 kstat_named_t arcstat_l2_feeds;
269 kstat_named_t arcstat_l2_rw_clash;
270 kstat_named_t arcstat_l2_writes_sent;
271 kstat_named_t arcstat_l2_writes_done;
272 kstat_named_t arcstat_l2_writes_error;
273 kstat_named_t arcstat_l2_writes_hdr_miss;
274 kstat_named_t arcstat_l2_evict_lock_retry;
275 kstat_named_t arcstat_l2_evict_reading;
276 kstat_named_t arcstat_l2_free_on_write;
277 kstat_named_t arcstat_l2_abort_lowmem;
278 kstat_named_t arcstat_l2_cksum_bad;
279 kstat_named_t arcstat_l2_io_error;
280 kstat_named_t arcstat_l2_size;
281 kstat_named_t arcstat_l2_hdr_size;
282 kstat_named_t arcstat_memory_throttle_count;
285 static arc_stats_t arc_stats = {
286 { "hits", KSTAT_DATA_UINT64 },
287 { "misses", KSTAT_DATA_UINT64 },
288 { "demand_data_hits", KSTAT_DATA_UINT64 },
289 { "demand_data_misses", KSTAT_DATA_UINT64 },
290 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
291 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
292 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
293 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
294 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
295 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
296 { "mru_hits", KSTAT_DATA_UINT64 },
297 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
298 { "mfu_hits", KSTAT_DATA_UINT64 },
299 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
300 { "deleted", KSTAT_DATA_UINT64 },
301 { "recycle_miss", KSTAT_DATA_UINT64 },
302 { "mutex_miss", KSTAT_DATA_UINT64 },
303 { "evict_skip", KSTAT_DATA_UINT64 },
304 { "hash_elements", KSTAT_DATA_UINT64 },
305 { "hash_elements_max", KSTAT_DATA_UINT64 },
306 { "hash_collisions", KSTAT_DATA_UINT64 },
307 { "hash_chains", KSTAT_DATA_UINT64 },
308 { "hash_chain_max", KSTAT_DATA_UINT64 },
309 { "p", KSTAT_DATA_UINT64 },
310 { "c", KSTAT_DATA_UINT64 },
311 { "c_min", KSTAT_DATA_UINT64 },
312 { "c_max", KSTAT_DATA_UINT64 },
313 { "size", KSTAT_DATA_UINT64 },
314 { "hdr_size", KSTAT_DATA_UINT64 },
315 { "l2_hits", KSTAT_DATA_UINT64 },
316 { "l2_misses", KSTAT_DATA_UINT64 },
317 { "l2_feeds", KSTAT_DATA_UINT64 },
318 { "l2_rw_clash", KSTAT_DATA_UINT64 },
319 { "l2_writes_sent", KSTAT_DATA_UINT64 },
320 { "l2_writes_done", KSTAT_DATA_UINT64 },
321 { "l2_writes_error", KSTAT_DATA_UINT64 },
322 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
323 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
324 { "l2_evict_reading", KSTAT_DATA_UINT64 },
325 { "l2_free_on_write", KSTAT_DATA_UINT64 },
326 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
327 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
328 { "l2_io_error", KSTAT_DATA_UINT64 },
329 { "l2_size", KSTAT_DATA_UINT64 },
330 { "l2_hdr_size", KSTAT_DATA_UINT64 },
331 { "memory_throttle_count", KSTAT_DATA_UINT64 }
334 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
336 #define ARCSTAT_INCR(stat, val) \
337 atomic_add_64(&arc_stats.stat.value.ui64, (val));
339 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
340 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
342 #define ARCSTAT_MAX(stat, val) { \
344 while ((val) > (m = arc_stats.stat.value.ui64) && \
345 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
349 #define ARCSTAT_MAXSTAT(stat) \
350 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
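/*
 * Example usage (as in buf_hash_insert() below): bump a counter and keep
 * its high-water mark in step, using only atomic operations:
 *
 *	ARCSTAT_BUMP(arcstat_hash_elements);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 */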
353 * We define a macro to allow ARC hits/misses to be easily broken down by
354 * two separate conditions, giving a total of four different subtypes for
355 * each of hits and misses (so eight statistics total).
357 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
360 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
362 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
366 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
368 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
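/*
 * For example, arc_buf_add_ref() below classifies a hit in one statement:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of the
 * arcstat_{demand,prefetch}_{data,metadata}_hits counters.
 */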
373 static arc_state_t *arc_anon;
374 static arc_state_t *arc_mru;
375 static arc_state_t *arc_mru_ghost;
376 static arc_state_t *arc_mfu;
377 static arc_state_t *arc_mfu_ghost;
378 static arc_state_t *arc_l2c_only;
381 * There are several ARC variables that are critical to export as kstats --
382 * but we don't want to have to grovel around in the kstat whenever we wish to
383 * manipulate them. For these variables, we therefore define them to be in
384 * terms of the statistic variable. This assures that we are not introducing
385 * the possibility of inconsistency by having shadow copies of the variables,
386 * while still allowing the code to be readable.
388 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
389 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
390 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
391 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
392 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
394 static int arc_no_grow; /* Don't try to grow cache size */
395 static uint64_t arc_tempreserve;
396 static uint64_t arc_meta_used;
397 static uint64_t arc_meta_limit;
398 static uint64_t arc_meta_max = 0;
399 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
400 &arc_meta_used, 0, "ARC metadata used");
401 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
402 &arc_meta_limit, 0, "ARC metadata limit");
404 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
406 typedef struct arc_callback arc_callback_t;
408 struct arc_callback {
410 arc_done_func_t *acb_done;
412 zio_t *acb_zio_dummy;
413 arc_callback_t *acb_next;
416 typedef struct arc_write_callback arc_write_callback_t;
418 struct arc_write_callback {
420 arc_done_func_t *awcb_ready;
421 arc_done_func_t *awcb_done;
426 /* protected by hash lock */
431 kmutex_t b_freeze_lock;
432 zio_cksum_t *b_freeze_cksum;
434 arc_buf_hdr_t *b_hash_next;
439 arc_callback_t *b_acb;
443 arc_buf_contents_t b_type;
447 /* protected by arc state mutex */
448 arc_state_t *b_state;
449 list_node_t b_arc_node;
451 /* updated atomically */
452 clock_t b_arc_access;
454 /* self protecting */
457 l2arc_buf_hdr_t *b_l2hdr;
458 list_node_t b_l2node;
461 static arc_buf_t *arc_eviction_list;
462 static kmutex_t arc_eviction_mtx;
463 static arc_buf_hdr_t arc_eviction_hdr;
464 static void arc_get_data_buf(arc_buf_t *buf);
465 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
466 static int arc_evict_needed(arc_buf_contents_t type);
467 static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
469 #define GHOST_STATE(state) \
470 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
471 (state) == arc_l2c_only)
474 * Private ARC flags. These are private, ARC-only flags that will show up
475 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
476 * be passed in as arc_flags in things like arc_read. However, these flags
477 * should never be passed and should only be set by ARC code. When adding new
478 * public flags, make sure not to smash the private ones.
481 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
482 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
483 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
484 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
485 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
486 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */
487 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
488 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
489 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
490 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
491 #define ARC_STORED (1 << 19) /* has been store()d to */
493 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
494 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
495 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
496 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
497 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
498 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
499 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
500 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
501 (hdr)->b_l2hdr != NULL)
502 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
503 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
504 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
510 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
511 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
514 * Hash table routines
517 #define HT_LOCK_PAD 128
522 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
526 #define BUF_LOCKS 256
527 typedef struct buf_hash_table {
529 arc_buf_hdr_t **ht_table;
530 struct ht_lock ht_locks[BUF_LOCKS];
533 static buf_hash_table_t buf_hash_table;
535 #define BUF_HASH_INDEX(spa, dva, birth) \
536 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
537 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
538 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
539 #define HDR_LOCK(buf) \
540 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
542 uint64_t zfs_crc64_table[256];
548 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
549 #define L2ARC_HEADROOM 4 /* num of writes */
550 #define L2ARC_FEED_SECS 1 /* caching interval */
552 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
553 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
556 * L2ARC Performance Tunables
558 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
559 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
560 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
561 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
562 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
567 typedef struct l2arc_dev {
568 vdev_t *l2ad_vdev; /* vdev */
569 spa_t *l2ad_spa; /* spa */
570 uint64_t l2ad_hand; /* next write location */
571 uint64_t l2ad_write; /* desired write size, bytes */
572 uint64_t l2ad_boost; /* warmup write boost, bytes */
573 uint64_t l2ad_start; /* first addr on device */
574 uint64_t l2ad_end; /* last addr on device */
575 uint64_t l2ad_evict; /* last addr eviction reached */
576 boolean_t l2ad_first; /* first sweep through */
577 list_t *l2ad_buflist; /* buffer list */
578 list_node_t l2ad_node; /* device list node */
581 static list_t L2ARC_dev_list; /* device list */
582 static list_t *l2arc_dev_list; /* device list pointer */
583 static kmutex_t l2arc_dev_mtx; /* device list mutex */
584 static l2arc_dev_t *l2arc_dev_last; /* last device used */
585 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
586 static list_t L2ARC_free_on_write; /* free after write buf list */
587 static list_t *l2arc_free_on_write; /* free after write list ptr */
588 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
589 static uint64_t l2arc_ndev; /* number of devices */
591 typedef struct l2arc_read_callback {
592 arc_buf_t *l2rcb_buf; /* read buffer */
593 spa_t *l2rcb_spa; /* spa */
594 blkptr_t l2rcb_bp; /* original blkptr */
595 zbookmark_t l2rcb_zb; /* original bookmark */
596 int l2rcb_flags; /* original flags */
597 } l2arc_read_callback_t;
599 typedef struct l2arc_write_callback {
600 l2arc_dev_t *l2wcb_dev; /* device info */
601 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
602 } l2arc_write_callback_t;
604 struct l2arc_buf_hdr {
605 /* protected by arc_buf_hdr mutex */
606 l2arc_dev_t *b_dev; /* L2ARC device */
607 daddr_t b_daddr; /* disk address, offset byte */
610 typedef struct l2arc_data_free {
611 /* protected by l2arc_free_on_write_mtx */
614 void (*l2df_func)(void *, size_t);
615 list_node_t l2df_list_node;
618 static kmutex_t l2arc_feed_thr_lock;
619 static kcondvar_t l2arc_feed_thr_cv;
620 static uint8_t l2arc_thread_exit;
622 static void l2arc_read_done(zio_t *zio);
623 static void l2arc_hdr_stat_add(void);
624 static void l2arc_hdr_stat_remove(void);
627 buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
629 uintptr_t spav = (uintptr_t)spa;
630 uint8_t *vdva = (uint8_t *)dva;
631 uint64_t crc = -1ULL;
634 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
636 for (i = 0; i < sizeof (dva_t); i++)
637 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
639 crc ^= (spav>>8) ^ birth;
644 #define BUF_EMPTY(buf) \
645 ((buf)->b_dva.dva_word[0] == 0 && \
646 (buf)->b_dva.dva_word[1] == 0 && \
649 #define BUF_EQUAL(spa, dva, birth, buf) \
650 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
651 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
652 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
654 static arc_buf_hdr_t *
655 buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
657 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
658 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
661 mutex_enter(hash_lock);
662 for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
663 buf = buf->b_hash_next) {
664 if (BUF_EQUAL(spa, dva, birth, buf)) {
669 mutex_exit(hash_lock);
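/*
 * A sketch of the caller pattern described at the top of this file: on a
 * hit the header comes back with its hash mutex already held in *lockp,
 * and the caller is responsible for dropping it (arc_read() is the real
 * consumer of this interface):
 *
 *	hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		... use hdr under hash_lock ...
 *		mutex_exit(hash_lock);
 *	}
 */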
675 * Insert an entry into the hash table. If there is already an element
676 * equal to elem in the hash table, then the already existing element
677 * will be returned and the new element will not be inserted.
678 * Otherwise returns NULL.
680 static arc_buf_hdr_t *
681 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
683 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
684 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
688 ASSERT(!HDR_IN_HASH_TABLE(buf));
690 mutex_enter(hash_lock);
691 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
692 fbuf = fbuf->b_hash_next, i++) {
693 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
697 buf->b_hash_next = buf_hash_table.ht_table[idx];
698 buf_hash_table.ht_table[idx] = buf;
699 buf->b_flags |= ARC_IN_HASH_TABLE;
701 /* collect some hash table performance data */
703 ARCSTAT_BUMP(arcstat_hash_collisions);
705 ARCSTAT_BUMP(arcstat_hash_chains);
707 ARCSTAT_MAX(arcstat_hash_chain_max, i);
710 ARCSTAT_BUMP(arcstat_hash_elements);
711 ARCSTAT_MAXSTAT(arcstat_hash_elements);
717 buf_hash_remove(arc_buf_hdr_t *buf)
719 arc_buf_hdr_t *fbuf, **bufp;
720 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
722 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
723 ASSERT(HDR_IN_HASH_TABLE(buf));
725 bufp = &buf_hash_table.ht_table[idx];
726 while ((fbuf = *bufp) != buf) {
727 ASSERT(fbuf != NULL);
728 bufp = &fbuf->b_hash_next;
730 *bufp = buf->b_hash_next;
731 buf->b_hash_next = NULL;
732 buf->b_flags &= ~ARC_IN_HASH_TABLE;
734 /* collect some hash table performance data */
735 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
737 if (buf_hash_table.ht_table[idx] &&
738 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
739 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
743 * Global data structures and functions for the buf kmem cache.
745 static kmem_cache_t *hdr_cache;
746 static kmem_cache_t *buf_cache;
753 kmem_free(buf_hash_table.ht_table,
754 (buf_hash_table.ht_mask + 1) * sizeof (void *));
755 for (i = 0; i < BUF_LOCKS; i++)
756 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
757 kmem_cache_destroy(hdr_cache);
758 kmem_cache_destroy(buf_cache);
762 * Constructor callback - called when the cache is empty
763 * and a new buf is requested.
767 hdr_cons(void *vbuf, void *unused, int kmflag)
769 arc_buf_hdr_t *buf = vbuf;
771 bzero(buf, sizeof (arc_buf_hdr_t));
772 refcount_create(&buf->b_refcnt);
773 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
774 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
776 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
782 buf_cons(void *vbuf, void *unused, int kmflag)
784 arc_buf_t *buf = vbuf;
786 bzero(buf, sizeof (arc_buf_t));
787 rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
792 * Destructor callback - called when a cached buf is
793 * no longer required.
797 hdr_dest(void *vbuf, void *unused)
799 arc_buf_hdr_t *buf = vbuf;
801 refcount_destroy(&buf->b_refcnt);
802 cv_destroy(&buf->b_cv);
803 mutex_destroy(&buf->b_freeze_lock);
805 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
810 buf_dest(void *vbuf, void *unused)
812 arc_buf_t *buf = vbuf;
814 rw_destroy(&buf->b_lock);
818 * Reclaim callback -- invoked when memory is low.
822 hdr_recl(void *unused)
824 dprintf("hdr_recl called\n");
826 * umem calls the reclaim func when we destroy the buf cache,
827 * which is after we do arc_fini().
830 cv_signal(&arc_reclaim_thr_cv);
837 uint64_t hsize = 1ULL << 12;
841 * The hash table is big enough to fill all of physical memory
842 * with an average 64K block size. The table will take up
843 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
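 * For example, on a machine with 8GB of physical memory the loop below
 * settles on 8GB / 64KB = 131072 buckets, i.e. 1MB of table with
 * 8-byte pointers.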
845 while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
848 buf_hash_table.ht_mask = hsize - 1;
849 buf_hash_table.ht_table =
850 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
851 if (buf_hash_table.ht_table == NULL) {
852 ASSERT(hsize > (1ULL << 8));
857 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
858 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
859 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
860 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
862 for (i = 0; i < 256; i++)
863 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
864 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
866 for (i = 0; i < BUF_LOCKS; i++) {
867 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
868 NULL, MUTEX_DEFAULT, NULL);
872 #define ARC_MINTIME (hz>>4) /* 62 ms */
875 arc_cksum_verify(arc_buf_t *buf)
879 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
882 mutex_enter(&buf->b_hdr->b_freeze_lock);
883 if (buf->b_hdr->b_freeze_cksum == NULL ||
884 (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
885 mutex_exit(&buf->b_hdr->b_freeze_lock);
888 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
889 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
890 panic("buffer modified while frozen!");
891 mutex_exit(&buf->b_hdr->b_freeze_lock);
895 arc_cksum_equal(arc_buf_t *buf)
900 mutex_enter(&buf->b_hdr->b_freeze_lock);
901 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
902 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
903 mutex_exit(&buf->b_hdr->b_freeze_lock);
909 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
911 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
914 mutex_enter(&buf->b_hdr->b_freeze_lock);
915 if (buf->b_hdr->b_freeze_cksum != NULL) {
916 mutex_exit(&buf->b_hdr->b_freeze_lock);
919 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
920 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
921 buf->b_hdr->b_freeze_cksum);
922 mutex_exit(&buf->b_hdr->b_freeze_lock);
926 arc_buf_thaw(arc_buf_t *buf)
928 if (zfs_flags & ZFS_DEBUG_MODIFY) {
929 if (buf->b_hdr->b_state != arc_anon)
930 panic("modifying non-anon buffer!");
931 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
932 panic("modifying buffer while i/o in progress!");
933 arc_cksum_verify(buf);
936 mutex_enter(&buf->b_hdr->b_freeze_lock);
937 if (buf->b_hdr->b_freeze_cksum != NULL) {
938 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
939 buf->b_hdr->b_freeze_cksum = NULL;
941 mutex_exit(&buf->b_hdr->b_freeze_lock);
945 arc_buf_freeze(arc_buf_t *buf)
947 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
950 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
951 buf->b_hdr->b_state == arc_anon);
952 arc_cksum_compute(buf, B_FALSE);
956 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
958 ASSERT(MUTEX_HELD(hash_lock));
960 if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
961 (ab->b_state != arc_anon)) {
962 uint64_t delta = ab->b_size * ab->b_datacnt;
963 list_t *list = &ab->b_state->arcs_list[ab->b_type];
964 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
966 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
967 mutex_enter(&ab->b_state->arcs_mtx);
968 ASSERT(list_link_active(&ab->b_arc_node));
969 list_remove(list, ab);
970 if (GHOST_STATE(ab->b_state)) {
971 ASSERT3U(ab->b_datacnt, ==, 0);
972 ASSERT3P(ab->b_buf, ==, NULL);
976 ASSERT3U(*size, >=, delta);
977 atomic_add_64(size, -delta);
978 mutex_exit(&ab->b_state->arcs_mtx);
979 /* remove the prefetch flag if we get a reference */
980 if (ab->b_flags & ARC_PREFETCH)
981 ab->b_flags &= ~ARC_PREFETCH;
986 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
989 arc_state_t *state = ab->b_state;
991 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
992 ASSERT(!GHOST_STATE(state));
994 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
995 (state != arc_anon)) {
996 uint64_t *size = &state->arcs_lsize[ab->b_type];
998 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
999 mutex_enter(&state->arcs_mtx);
1000 ASSERT(!list_link_active(&ab->b_arc_node));
1001 list_insert_head(&state->arcs_list[ab->b_type], ab);
1002 ASSERT(ab->b_datacnt > 0);
1003 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1004 mutex_exit(&state->arcs_mtx);
1010 * Move the supplied buffer to the indicated state. The mutex
1011 * for the buffer must be held by the caller.
1014 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1016 arc_state_t *old_state = ab->b_state;
1017 int64_t refcnt = refcount_count(&ab->b_refcnt);
1018 uint64_t from_delta, to_delta;
1020 ASSERT(MUTEX_HELD(hash_lock));
1021 ASSERT(new_state != old_state);
1022 ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1023 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1025 from_delta = to_delta = ab->b_datacnt * ab->b_size;
1028 * If this buffer is evictable, transfer it from the
1029 * old state list to the new state list.
1032 if (old_state != arc_anon) {
1033 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1034 uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1037 mutex_enter(&old_state->arcs_mtx);
1039 ASSERT(list_link_active(&ab->b_arc_node));
1040 list_remove(&old_state->arcs_list[ab->b_type], ab);
1043 * If prefetching out of the ghost cache,
1044 * we will have a non-zero datacnt.
1046 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1047 /* ghost elements have a ghost size */
1048 ASSERT(ab->b_buf == NULL);
1049 from_delta = ab->b_size;
1051 ASSERT3U(*size, >=, from_delta);
1052 atomic_add_64(size, -from_delta);
1055 mutex_exit(&old_state->arcs_mtx);
1057 if (new_state != arc_anon) {
1058 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1059 uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1062 mutex_enter(&new_state->arcs_mtx);
1064 list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1066 /* ghost elements have a ghost size */
1067 if (GHOST_STATE(new_state)) {
1068 ASSERT(ab->b_datacnt == 0);
1069 ASSERT(ab->b_buf == NULL);
1070 to_delta = ab->b_size;
1072 atomic_add_64(size, to_delta);
1075 mutex_exit(&new_state->arcs_mtx);
1079 ASSERT(!BUF_EMPTY(ab));
1080 if (new_state == arc_anon) {
1081 buf_hash_remove(ab);
1084 /* adjust state sizes */
1086 atomic_add_64(&new_state->arcs_size, to_delta);
1088 ASSERT3U(old_state->arcs_size, >=, from_delta);
1089 atomic_add_64(&old_state->arcs_size, -from_delta);
1091 ab->b_state = new_state;
1093 /* adjust l2arc hdr stats */
1094 if (new_state == arc_l2c_only)
1095 l2arc_hdr_stat_add();
1096 else if (old_state == arc_l2c_only)
1097 l2arc_hdr_stat_remove();
1101 arc_space_consume(uint64_t space)
1103 atomic_add_64(&arc_meta_used, space);
1104 atomic_add_64(&arc_size, space);
1108 arc_space_return(uint64_t space)
1110 ASSERT(arc_meta_used >= space);
1111 if (arc_meta_max < arc_meta_used)
1112 arc_meta_max = arc_meta_used;
1113 atomic_add_64(&arc_meta_used, -space);
1114 ASSERT(arc_size >= space);
1115 atomic_add_64(&arc_size, -space);
1119 arc_data_buf_alloc(uint64_t size)
1121 if (arc_evict_needed(ARC_BUFC_DATA))
1122 cv_signal(&arc_reclaim_thr_cv);
1123 atomic_add_64(&arc_size, size);
1124 return (zio_data_buf_alloc(size));
1128 arc_data_buf_free(void *buf, uint64_t size)
1130 zio_data_buf_free(buf, size);
1131 ASSERT(arc_size >= size);
1132 atomic_add_64(&arc_size, -size);
1136 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1141 ASSERT3U(size, >, 0);
1142 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1143 ASSERT(BUF_EMPTY(hdr));
1147 hdr->b_state = arc_anon;
1148 hdr->b_arc_access = 0;
1149 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1152 buf->b_efunc = NULL;
1153 buf->b_private = NULL;
1156 arc_get_data_buf(buf);
1159 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1160 (void) refcount_add(&hdr->b_refcnt, tag);
1166 arc_buf_clone(arc_buf_t *from)
1169 arc_buf_hdr_t *hdr = from->b_hdr;
1170 uint64_t size = hdr->b_size;
1172 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1175 buf->b_efunc = NULL;
1176 buf->b_private = NULL;
1177 buf->b_next = hdr->b_buf;
1179 arc_get_data_buf(buf);
1180 bcopy(from->b_data, buf->b_data, size);
1181 hdr->b_datacnt += 1;
1186 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1189 kmutex_t *hash_lock;
1192 * Check to see if this buffer is evicted. Callers
1193 * must verify b_data != NULL to know if the add_ref
1196 rw_enter(&buf->b_lock, RW_READER);
1197 if (buf->b_data == NULL) {
1198 rw_exit(&buf->b_lock);
1202 ASSERT(hdr != NULL);
1203 hash_lock = HDR_LOCK(hdr);
1204 mutex_enter(hash_lock);
1205 rw_exit(&buf->b_lock);
1207 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1208 add_reference(hdr, hash_lock, tag);
1209 arc_access(hdr, hash_lock);
1210 mutex_exit(hash_lock);
1211 ARCSTAT_BUMP(arcstat_hits);
1212 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1213 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1214 data, metadata, hits);
1218 * Free the arc data buffer. If it is an l2arc write in progress,
1219 * the buffer is placed on l2arc_free_on_write to be freed later.
1222 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1223 void *data, size_t size)
1225 if (HDR_L2_WRITING(hdr)) {
1226 l2arc_data_free_t *df;
1227 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1228 df->l2df_data = data;
1229 df->l2df_size = size;
1230 df->l2df_func = free_func;
1231 mutex_enter(&l2arc_free_on_write_mtx);
1232 list_insert_head(l2arc_free_on_write, df);
1233 mutex_exit(&l2arc_free_on_write_mtx);
1234 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1236 free_func(data, size);
1241 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1245 /* free up data associated with the buf */
1247 arc_state_t *state = buf->b_hdr->b_state;
1248 uint64_t size = buf->b_hdr->b_size;
1249 arc_buf_contents_t type = buf->b_hdr->b_type;
1251 arc_cksum_verify(buf);
1253 if (type == ARC_BUFC_METADATA) {
1254 arc_buf_data_free(buf->b_hdr, zio_buf_free,
1256 arc_space_return(size);
1258 ASSERT(type == ARC_BUFC_DATA);
1259 arc_buf_data_free(buf->b_hdr,
1260 zio_data_buf_free, buf->b_data, size);
1261 atomic_add_64(&arc_size, -size);
1264 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1265 uint64_t *cnt = &state->arcs_lsize[type];
1267 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1268 ASSERT(state != arc_anon);
1270 ASSERT3U(*cnt, >=, size);
1271 atomic_add_64(cnt, -size);
1273 ASSERT3U(state->arcs_size, >=, size);
1274 atomic_add_64(&state->arcs_size, -size);
1276 ASSERT(buf->b_hdr->b_datacnt > 0);
1277 buf->b_hdr->b_datacnt -= 1;
1280 /* only remove the buf if requested */
1284 /* remove the buf from the hdr list */
1285 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1287 *bufp = buf->b_next;
1289 ASSERT(buf->b_efunc == NULL);
1291 /* clean up the buf */
1293 kmem_cache_free(buf_cache, buf);
1297 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1299 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1300 ASSERT3P(hdr->b_state, ==, arc_anon);
1301 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1302 ASSERT(!(hdr->b_flags & ARC_STORED));
1304 if (hdr->b_l2hdr != NULL) {
1305 if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
1307 * To prevent arc_free() and l2arc_evict() from
1308 * attempting to free the same buffer at the same time,
1309 * a FREE_IN_PROGRESS flag is given to arc_free() to
1310 * give it priority. l2arc_evict() can't destroy this
1311 * header while we are waiting on l2arc_buflist_mtx.
1313 * The hdr may be removed from l2ad_buflist before we
1314 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1316 mutex_enter(&l2arc_buflist_mtx);
1317 if (hdr->b_l2hdr != NULL) {
1318 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
1321 mutex_exit(&l2arc_buflist_mtx);
1323 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
1325 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1326 kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
1327 if (hdr->b_state == arc_l2c_only)
1328 l2arc_hdr_stat_remove();
1329 hdr->b_l2hdr = NULL;
1332 if (!BUF_EMPTY(hdr)) {
1333 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1334 bzero(&hdr->b_dva, sizeof (dva_t));
1338 while (hdr->b_buf) {
1339 arc_buf_t *buf = hdr->b_buf;
1342 mutex_enter(&arc_eviction_mtx);
1343 rw_enter(&buf->b_lock, RW_WRITER);
1344 ASSERT(buf->b_hdr != NULL);
1345 arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1346 hdr->b_buf = buf->b_next;
1347 buf->b_hdr = &arc_eviction_hdr;
1348 buf->b_next = arc_eviction_list;
1349 arc_eviction_list = buf;
1350 rw_exit(&buf->b_lock);
1351 mutex_exit(&arc_eviction_mtx);
1353 arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1356 if (hdr->b_freeze_cksum != NULL) {
1357 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1358 hdr->b_freeze_cksum = NULL;
1361 ASSERT(!list_link_active(&hdr->b_arc_node));
1362 ASSERT3P(hdr->b_hash_next, ==, NULL);
1363 ASSERT3P(hdr->b_acb, ==, NULL);
1364 kmem_cache_free(hdr_cache, hdr);
1368 arc_buf_free(arc_buf_t *buf, void *tag)
1370 arc_buf_hdr_t *hdr = buf->b_hdr;
1371 int hashed = hdr->b_state != arc_anon;
1373 ASSERT(buf->b_efunc == NULL);
1374 ASSERT(buf->b_data != NULL);
1377 kmutex_t *hash_lock = HDR_LOCK(hdr);
1379 mutex_enter(hash_lock);
1380 (void) remove_reference(hdr, hash_lock, tag);
1381 if (hdr->b_datacnt > 1)
1382 arc_buf_destroy(buf, FALSE, TRUE);
1384 hdr->b_flags |= ARC_BUF_AVAILABLE;
1385 mutex_exit(hash_lock);
1386 } else if (HDR_IO_IN_PROGRESS(hdr)) {
1389 * We are in the middle of an async write. Don't destroy
1390 * this buffer unless the write completes before we finish
1391 * decrementing the reference count.
1393 mutex_enter(&arc_eviction_mtx);
1394 (void) remove_reference(hdr, NULL, tag);
1395 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1396 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1397 mutex_exit(&arc_eviction_mtx);
1399 arc_hdr_destroy(hdr);
1401 if (remove_reference(hdr, NULL, tag) > 0) {
1402 ASSERT(HDR_IO_ERROR(hdr));
1403 arc_buf_destroy(buf, FALSE, TRUE);
1405 arc_hdr_destroy(hdr);
1411 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1413 arc_buf_hdr_t *hdr = buf->b_hdr;
1414 kmutex_t *hash_lock = HDR_LOCK(hdr);
1415 int no_callback = (buf->b_efunc == NULL);
1417 if (hdr->b_state == arc_anon) {
1418 arc_buf_free(buf, tag);
1419 return (no_callback);
1422 mutex_enter(hash_lock);
1423 ASSERT(hdr->b_state != arc_anon);
1424 ASSERT(buf->b_data != NULL);
1426 (void) remove_reference(hdr, hash_lock, tag);
1427 if (hdr->b_datacnt > 1) {
1429 arc_buf_destroy(buf, FALSE, TRUE);
1430 } else if (no_callback) {
1431 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1432 hdr->b_flags |= ARC_BUF_AVAILABLE;
1434 ASSERT(no_callback || hdr->b_datacnt > 1 ||
1435 refcount_is_zero(&hdr->b_refcnt));
1436 mutex_exit(hash_lock);
1437 return (no_callback);
1441 arc_buf_size(arc_buf_t *buf)
1443 return (buf->b_hdr->b_size);
1447 * Evict buffers from list until we've removed the specified number of
1448 * bytes. Move the removed buffers to the appropriate evict state.
1449 * If the recycle flag is set, then attempt to "recycle" a buffer:
1450 * - look for a buffer to evict that is `bytes' long.
1451 * - return the data block from this buffer rather than freeing it.
1452 * This flag is used by callers that are trying to make space for a
1453 * new buffer in a full arc cache.
1455 * This function makes a "best effort". It skips over any buffers
1456 * it can't get a hash_lock on, and so may not catch all candidates.
1457 * It may also return without evicting as much space as requested.
1460 arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
1461 arc_buf_contents_t type)
1463 arc_state_t *evicted_state;
1464 uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1465 arc_buf_hdr_t *ab, *ab_prev = NULL;
1466 list_t *list = &state->arcs_list[type];
1467 kmutex_t *hash_lock;
1468 boolean_t have_lock;
1469 void *stolen = NULL;
1471 ASSERT(state == arc_mru || state == arc_mfu);
1473 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1475 mutex_enter(&state->arcs_mtx);
1476 mutex_enter(&evicted_state->arcs_mtx);
1478 for (ab = list_tail(list); ab; ab = ab_prev) {
1479 ab_prev = list_prev(list, ab);
1480 /* prefetch buffers have a minimum lifespan */
1481 if (HDR_IO_IN_PROGRESS(ab) ||
1482 (spa && ab->b_spa != spa) ||
1483 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1484 LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1488 /* "lookahead" for better eviction candidate */
1489 if (recycle && ab->b_size != bytes &&
1490 ab_prev && ab_prev->b_size == bytes)
1492 hash_lock = HDR_LOCK(ab);
1493 have_lock = MUTEX_HELD(hash_lock);
1494 if (have_lock || mutex_tryenter(hash_lock)) {
1495 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1496 ASSERT(ab->b_datacnt > 0);
1498 arc_buf_t *buf = ab->b_buf;
1499 if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1504 bytes_evicted += ab->b_size;
1505 if (recycle && ab->b_type == type &&
1506 ab->b_size == bytes &&
1507 !HDR_L2_WRITING(ab)) {
1508 stolen = buf->b_data;
1513 mutex_enter(&arc_eviction_mtx);
1514 arc_buf_destroy(buf,
1515 buf->b_data == stolen, FALSE);
1516 ab->b_buf = buf->b_next;
1517 buf->b_hdr = &arc_eviction_hdr;
1518 buf->b_next = arc_eviction_list;
1519 arc_eviction_list = buf;
1520 mutex_exit(&arc_eviction_mtx);
1521 rw_exit(&buf->b_lock);
1523 rw_exit(&buf->b_lock);
1524 arc_buf_destroy(buf,
1525 buf->b_data == stolen, TRUE);
1528 if (ab->b_datacnt == 0) {
1529 arc_change_state(evicted_state, ab, hash_lock);
1530 ASSERT(HDR_IN_HASH_TABLE(ab));
1531 ab->b_flags |= ARC_IN_HASH_TABLE;
1532 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1533 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1536 mutex_exit(hash_lock);
1537 if (bytes >= 0 && bytes_evicted >= bytes)
1544 mutex_exit(&evicted_state->arcs_mtx);
1545 mutex_exit(&state->arcs_mtx);
1547 if (bytes_evicted < bytes)
1548 dprintf("only evicted %lld bytes from %x",
1549 (longlong_t)bytes_evicted, state);
1552 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1555 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1558 * We have just evicted some data into the ghost state; make
1559 * sure we also adjust the ghost state size if necessary.
1562 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1563 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1564 arc_mru_ghost->arcs_size - arc_c;
1566 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1568 MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1569 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1570 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1571 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1572 arc_mru_ghost->arcs_size +
1573 arc_mfu_ghost->arcs_size - arc_c);
1574 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1582 * Remove buffers from list until we've removed the specified number of
1583 * bytes. Destroy the buffers that are removed.
1586 arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
1588 arc_buf_hdr_t *ab, *ab_prev;
1589 list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1590 kmutex_t *hash_lock;
1591 uint64_t bytes_deleted = 0;
1592 uint64_t bufs_skipped = 0;
1594 ASSERT(GHOST_STATE(state));
1596 mutex_enter(&state->arcs_mtx);
1597 for (ab = list_tail(list); ab; ab = ab_prev) {
1598 ab_prev = list_prev(list, ab);
1599 if (spa && ab->b_spa != spa)
1601 hash_lock = HDR_LOCK(ab);
1602 if (mutex_tryenter(hash_lock)) {
1603 ASSERT(!HDR_IO_IN_PROGRESS(ab));
1604 ASSERT(ab->b_buf == NULL);
1605 ARCSTAT_BUMP(arcstat_deleted);
1606 bytes_deleted += ab->b_size;
1608 if (ab->b_l2hdr != NULL) {
1610 * This buffer is cached on the 2nd Level ARC;
1611 * don't destroy the header.
1613 arc_change_state(arc_l2c_only, ab, hash_lock);
1614 mutex_exit(hash_lock);
1616 arc_change_state(arc_anon, ab, hash_lock);
1617 mutex_exit(hash_lock);
1618 arc_hdr_destroy(ab);
1621 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1622 if (bytes >= 0 && bytes_deleted >= bytes)
1626 mutex_exit(&state->arcs_mtx);
1627 mutex_enter(hash_lock);
1628 mutex_exit(hash_lock);
1634 mutex_exit(&state->arcs_mtx);
1636 if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1637 (bytes < 0 || bytes_deleted < bytes)) {
1638 list = &state->arcs_list[ARC_BUFC_METADATA];
1643 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1647 if (bytes_deleted < bytes)
1648 dprintf("only deleted %lld bytes from %p",
1649 (longlong_t)bytes_deleted, state);
1655 int64_t top_sz, mru_over, arc_over, todelete;
1657 top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
1659 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1661 MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
1662 (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
1663 top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1666 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1668 MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
1669 (void) arc_evict(arc_mru, NULL, toevict, FALSE,
1671 top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1674 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1677 if (arc_mru_ghost->arcs_size > 0) {
1678 todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
1679 arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1683 if ((arc_over = arc_size - arc_c) > 0) {
1686 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1688 MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
1689 (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1691 arc_over = arc_size - arc_c;
1695 arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1697 MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
1699 (void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1703 tbl_over = arc_size + arc_mru_ghost->arcs_size +
1704 arc_mfu_ghost->arcs_size - arc_c * 2;
1706 if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
1707 todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
1708 arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1714 arc_do_user_evicts(void)
1716 static arc_buf_t *tmp_arc_eviction_list;
1719 * Move list over to avoid LOR (lock order reversal)
1722 mutex_enter(&arc_eviction_mtx);
1723 tmp_arc_eviction_list = arc_eviction_list;
1724 arc_eviction_list = NULL;
1725 mutex_exit(&arc_eviction_mtx);
1727 while (tmp_arc_eviction_list != NULL) {
1728 arc_buf_t *buf = tmp_arc_eviction_list;
1729 tmp_arc_eviction_list = buf->b_next;
1730 rw_enter(&buf->b_lock, RW_WRITER);
1732 rw_exit(&buf->b_lock);
1734 if (buf->b_efunc != NULL)
1735 VERIFY(buf->b_efunc(buf) == 0);
1737 buf->b_efunc = NULL;
1738 buf->b_private = NULL;
1739 kmem_cache_free(buf_cache, buf);
1742 if (arc_eviction_list != NULL)
1747 * Flush all *evictable* data from the cache for the given spa.
1748 * NOTE: this will not touch "active" (i.e. referenced) data.
1751 arc_flush(spa_t *spa)
1753 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
1754 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
1758 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
1759 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
1763 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
1764 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
1768 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
1769 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
1774 arc_evict_ghost(arc_mru_ghost, spa, -1);
1775 arc_evict_ghost(arc_mfu_ghost, spa, -1);
1777 mutex_enter(&arc_reclaim_thr_lock);
1778 arc_do_user_evicts();
1779 mutex_exit(&arc_reclaim_thr_lock);
1780 ASSERT(spa || arc_eviction_list == NULL);
1783 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
1788 if (arc_c > arc_c_min) {
1792 to_free = arc_c >> arc_shrink_shift;
1794 to_free = arc_c >> arc_shrink_shift;
1796 if (arc_c > arc_c_min + to_free)
1797 atomic_add_64(&arc_c, -to_free);
1801 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1802 if (arc_c > arc_size)
1803 arc_c = MAX(arc_size, arc_c_min);
1805 arc_p = (arc_c >> 1);
1806 ASSERT(arc_c >= arc_c_min);
1807 ASSERT((int64_t)arc_p >= 0);
1810 if (arc_size > arc_c)
1814 static int needfree = 0;
1817 arc_reclaim_needed(void)
1826 * If pages are needed, or we're within 2048 pages of needing
1827 * to page, we need to reclaim.
1829 if (vm_pages_needed || (vm_paging_target() > -2048))
1837 * take 'desfree' extra pages, so we reclaim sooner, rather than later
1842 * check that we're out of range of the pageout scanner. It starts to
1843 * schedule paging if freemem is less than lotsfree and needfree.
1844 * lotsfree is the high-water mark for pageout, and needfree is the
1845 * number of needed free pages. We add extra pages here to make sure
1846 * the scanner doesn't start up while we're freeing memory.
1848 if (freemem < lotsfree + needfree + extra)
1852 * check to make sure that swapfs has enough space so that anon
1853 * reservations can still succeed. anon_resvmem() checks that the
1854 * availrmem is greater than swapfs_minfree, and the number of reserved
1855 * swap pages. We also add a bit of extra here just to prevent
1856 * circumstances from getting really dire.
1858 if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1863 * If we're on an i386 platform, it's possible that we'll exhaust the
1864 * kernel heap space before we ever run out of available physical
1865 * memory. Most checks of the size of the heap_area compare against
1866 * tune.t_minarmem, which is the minimum available real memory that we
1867 * can have in the system. However, this is generally fixed at 25 pages
1868 * which is so low that it's useless. In this comparison, we seek to
1869 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1870 * heap is allocated. (Or, in the calculation, if less than 1/4th is
1873 if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1874 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1878 if (kmem_used() > (kmem_size() * 3) / 4)
1883 if (spa_get_random(100) == 0)
1890 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1894 kmem_cache_t *prev_cache = NULL;
1895 kmem_cache_t *prev_data_cache = NULL;
1896 extern kmem_cache_t *zio_buf_cache[];
1897 extern kmem_cache_t *zio_data_buf_cache[];
1901 if (arc_meta_used >= arc_meta_limit) {
1903 * We are exceeding our meta-data cache limit.
1904 * Purge some DNLC entries to release holds on meta-data.
1906 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1910 * Reclaim unused memory from all kmem caches.
1917 * An aggressive reclamation will shrink the cache size as well as
1918 * reap free buffers from the arc kmem caches.
1920 if (strat == ARC_RECLAIM_AGGR)
1924 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1925 if (zio_buf_cache[i] != prev_cache) {
1926 prev_cache = zio_buf_cache[i];
1927 kmem_cache_reap_now(zio_buf_cache[i]);
1929 if (zio_data_buf_cache[i] != prev_data_cache) {
1930 prev_data_cache = zio_data_buf_cache[i];
1931 kmem_cache_reap_now(zio_data_buf_cache[i]);
1935 kmem_cache_reap_now(buf_cache);
1936 kmem_cache_reap_now(hdr_cache);
1940 arc_reclaim_thread(void *dummy __unused)
1942 clock_t growtime = 0;
1943 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
1946 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1948 mutex_enter(&arc_reclaim_thr_lock);
1949 while (arc_thread_exit == 0) {
1950 if (arc_reclaim_needed()) {
1953 if (last_reclaim == ARC_RECLAIM_CONS) {
1954 last_reclaim = ARC_RECLAIM_AGGR;
1956 last_reclaim = ARC_RECLAIM_CONS;
1960 last_reclaim = ARC_RECLAIM_AGGR;
1964 /* reset the growth delay for every reclaim */
1965 growtime = LBOLT + (arc_grow_retry * hz);
1967 if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
1969 * If needfree is TRUE, our vm_lowmem hook
1970 * was called; in that case we must free some
1971 * memory, so switch to aggressive mode.
1974 last_reclaim = ARC_RECLAIM_AGGR;
1976 arc_kmem_reap_now(last_reclaim);
1979 } else if (arc_no_grow && LBOLT >= growtime) {
1980 arc_no_grow = FALSE;
1984 (2 * arc_c < arc_size +
1985 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
1988 if (arc_eviction_list != NULL)
1989 arc_do_user_evicts();
1991 if (arc_reclaim_needed()) {
1998 /* block until needed, or one second, whichever is shorter */
1999 CALLB_CPR_SAFE_BEGIN(&cpr);
2000 (void) cv_timedwait(&arc_reclaim_thr_cv,
2001 &arc_reclaim_thr_lock, hz);
2002 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2005 arc_thread_exit = 0;
2006 cv_broadcast(&arc_reclaim_thr_cv);
2007 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
2012 * Adapt arc info given the number of bytes we are trying to add and
2013 * the state that we are coming from. This function is only called
2014 * when we are adding new content to the cache.
2017 arc_adapt(int bytes, arc_state_t *state)
2021 if (state == arc_l2c_only)
2026 * Adapt the target size of the MRU list:
2027 * - if we just hit in the MRU ghost list, then increase
2028 * the target size of the MRU list.
2029 * - if we just hit in the MFU ghost list, then increase
2030 * the target size of the MFU list by decreasing the
2031 * target size of the MRU list.
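 *
 * For example, a hit in the MFU ghost list while the MRU ghost list is
 * twice the size of the MFU ghost list yields mult = 2 below, so arc_p
 * drops by 2 * bytes, shifting that much target space from the MRU to
 * the MFU.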
2033 if (state == arc_mru_ghost) {
2034 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2035 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2037 arc_p = MIN(arc_c, arc_p + bytes * mult);
2038 } else if (state == arc_mfu_ghost) {
2039 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2040 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2042 arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
2044 ASSERT((int64_t)arc_p >= 0);
2046 if (arc_reclaim_needed()) {
2047 cv_signal(&arc_reclaim_thr_cv);
2054 if (arc_c >= arc_c_max)
2058 * If we're within (2 * maxblocksize) bytes of the target
2059 * cache size, increment the target cache size
2061 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2062 atomic_add_64(&arc_c, (int64_t)bytes);
2063 if (arc_c > arc_c_max)
2065 else if (state == arc_anon)
2066 atomic_add_64(&arc_p, (int64_t)bytes);
2070 ASSERT((int64_t)arc_p >= 0);
2074 * Check if the cache has reached its limits and eviction is required
2078 arc_evict_needed(arc_buf_contents_t type)
2080 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2086 * If zio data pages are being allocated out of a separate heap segment,
2087 * then enforce that the size of available vmem for this area remains
2088 * above about 1/32nd free.
2090 if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2091 vmem_size(zio_arena, VMEM_FREE) <
2092 (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2097 if (arc_reclaim_needed())
2100 return (arc_size > arc_c);
2104 * The buffer, supplied as the first argument, needs a data block.
2105 * So, if we are at cache max, determine which cache should be victimized.
2106 * We have the following cases:
2108 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2109 * In this situation if we're out of space, but the resident size of the MFU is
2110 * under the limit, victimize the MFU cache to satisfy this insertion request.
2112 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2113 * Here, we've used up all of the available space for the MRU, so we need to
2114 * evict from our own cache instead. Evict from the set of resident MRU
2117 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2118 * c minus p represents the MFU space in the cache, since p is the size of the
2119 * cache that is dedicated to the MRU. In this situation there's still space on
2120 * the MFU side, so the MRU side needs to be victimized.
2122 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2123 * MFU's resident set is consuming more space than it has been allotted. In
2124 * this situation, we must victimize our own cache, the MFU, for this insertion.
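 *
 * A concrete (made-up) illustration: with c = 1GB and p = 600MB, an
 * insert headed for the MRU while anon + MRU already hold 700MB is
 * case 2, so the MRU itself is victimized; were anon + MRU only 400MB,
 * it would be case 1 and the MFU would be victimized instead, provided
 * the MFU has evictable buffers of the requested type.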
2127 arc_get_data_buf(arc_buf_t *buf)
2129 arc_state_t *state = buf->b_hdr->b_state;
2130 uint64_t size = buf->b_hdr->b_size;
2131 arc_buf_contents_t type = buf->b_hdr->b_type;
2133 arc_adapt(size, state);
2136 * We have not yet reached cache maximum size,
2137 * just allocate a new buffer.
2139 if (!arc_evict_needed(type)) {
2140 if (type == ARC_BUFC_METADATA) {
2141 buf->b_data = zio_buf_alloc(size);
2142 arc_space_consume(size);
2144 ASSERT(type == ARC_BUFC_DATA);
2145 buf->b_data = zio_data_buf_alloc(size);
2146 atomic_add_64(&arc_size, size);
2152 * If we are prefetching from the mfu ghost list, this buffer
2153 * will end up on the mru list; so steal space from there.
2155 if (state == arc_mfu_ghost)
2156 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2157 else if (state == arc_mru_ghost)
2160 if (state == arc_mru || state == arc_anon) {
2161 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2162 state = (arc_mfu->arcs_lsize[type] > 0 &&
2163 arc_p > mru_used) ? arc_mfu : arc_mru;
2166 uint64_t mfu_space = arc_c - arc_p;
2167 state = (arc_mru->arcs_lsize[type] > 0 &&
2168 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2170 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2171 if (type == ARC_BUFC_METADATA) {
2172 buf->b_data = zio_buf_alloc(size);
2173 arc_space_consume(size);
2175 ASSERT(type == ARC_BUFC_DATA);
2176 buf->b_data = zio_data_buf_alloc(size);
2177 atomic_add_64(&arc_size, size);
2179 ARCSTAT_BUMP(arcstat_recycle_miss);
2181 ASSERT(buf->b_data != NULL);
2184 * Update the state size. Note that ghost states have a
2185 * "ghost size" and so don't need to be updated.
2187 if (!GHOST_STATE(buf->b_hdr->b_state)) {
2188 arc_buf_hdr_t *hdr = buf->b_hdr;
2190 atomic_add_64(&hdr->b_state->arcs_size, size);
2191 if (list_link_active(&hdr->b_arc_node)) {
2192 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2193 atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2196 * If we are growing the cache, and we are adding anonymous
2197 * data, and we have outgrown arc_p, update arc_p
2199 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2200 arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2201 arc_p = MIN(arc_c, arc_p + size);
2206 * This routine is called whenever a buffer is accessed.
2207 * NOTE: the hash lock is dropped in this function.
2210 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2212 ASSERT(MUTEX_HELD(hash_lock));
2214 if (buf->b_state == arc_anon) {
2216 * This buffer is not in the cache, and does not
2217 * appear in our "ghost" list. Add the new buffer to the MRU state.
2221 ASSERT(buf->b_arc_access == 0);
2222 buf->b_arc_access = LBOLT;
2223 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2224 arc_change_state(arc_mru, buf, hash_lock);
2226 } else if (buf->b_state == arc_mru) {
2228 * If this buffer is here because of a prefetch, then either:
2229 * - clear the flag if this is a "referencing" read
2230 * (any subsequent access will bump this into the MFU state).
2232 * - move the buffer to the head of the list if this is
2233 * another prefetch (to make it less likely to be evicted).
2235 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2236 if (refcount_count(&buf->b_refcnt) == 0) {
2237 ASSERT(list_link_active(&buf->b_arc_node));
2239 buf->b_flags &= ~ARC_PREFETCH;
2240 ARCSTAT_BUMP(arcstat_mru_hits);
2242 buf->b_arc_access = LBOLT;
2247 * This buffer has been "accessed" only once so far,
2248 * but it is still in the cache. Move it to the MFU state.
2251 if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
2253 * More than 125ms have passed since we
2254 * instantiated this buffer. Move it to the
2255 * most frequently used state.
2257 buf->b_arc_access = LBOLT;
2258 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2259 arc_change_state(arc_mfu, buf, hash_lock);
2261 ARCSTAT_BUMP(arcstat_mru_hits);
2262 } else if (buf->b_state == arc_mru_ghost) {
2263 arc_state_t *new_state;
2265 * This buffer has been "accessed" recently, but
2266 * was evicted from the cache. Move it to the MFU state.
2270 if (buf->b_flags & ARC_PREFETCH) {
2271 new_state = arc_mru;
2272 if (refcount_count(&buf->b_refcnt) > 0)
2273 buf->b_flags &= ~ARC_PREFETCH;
2274 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2276 new_state = arc_mfu;
2277 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2280 buf->b_arc_access = LBOLT;
2281 arc_change_state(new_state, buf, hash_lock);
2283 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2284 } else if (buf->b_state == arc_mfu) {
2286 * This buffer has been accessed more than once and is
2287 * still in the cache. Keep it in the MFU state.
2289 * NOTE: an add_reference() that occurred when we did
2290 * the arc_read() will have kicked this off the list.
2291 * If it was a prefetch, we will explicitly move it to
2292 * the head of the list now.
2294 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2295 ASSERT(refcount_count(&buf->b_refcnt) == 0);
2296 ASSERT(list_link_active(&buf->b_arc_node));
2298 ARCSTAT_BUMP(arcstat_mfu_hits);
2299 buf->b_arc_access = LBOLT;
2300 } else if (buf->b_state == arc_mfu_ghost) {
2301 arc_state_t *new_state = arc_mfu;
2303 * This buffer has been accessed more than once but has
2304 * been evicted from the cache. Move it back to the MFU state.
2308 if (buf->b_flags & ARC_PREFETCH) {
2310 * This is a prefetch access...
2311 * move this block back to the MRU state.
2313 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2314 new_state = arc_mru;
2317 buf->b_arc_access = LBOLT;
2318 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2319 arc_change_state(new_state, buf, hash_lock);
2321 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2322 } else if (buf->b_state == arc_l2c_only) {
2324 * This buffer is on the 2nd Level ARC.
2327 buf->b_arc_access = LBOLT;
2328 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2329 arc_change_state(arc_mfu, buf, hash_lock);
2331 ASSERT(!"invalid arc state");
2335 /* a generic arc_done_func_t which you can use */
2338 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2340 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2341 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2344 /* a generic arc_done_func_t */
2346 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2348 arc_buf_t **bufp = arg;
2349 if (zio && zio->io_error) {
2350 VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2358 arc_read_done(zio_t *zio)
2360 arc_buf_hdr_t *hdr, *found;
2362 arc_buf_t *abuf; /* buffer we're assigning to callback */
2363 kmutex_t *hash_lock;
2364 arc_callback_t *callback_list, *acb;
2365 int freeable = FALSE;
2367 buf = zio->io_private;
2371 * The hdr was inserted into hash-table and removed from lists
2372 * prior to starting I/O. We should find this header, since
2373 * it's in the hash table, and it should be legit since it's
2374 * not possible to evict it during the I/O. The only possible
2375 * reason for it not to be found is if we were freed during the read.
2378 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
2381 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2382 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2383 (found == hdr && HDR_L2_READING(hdr)));
2385 hdr->b_flags &= ~ARC_L2_EVICTED;
2386 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2387 hdr->b_flags &= ~ARC_L2CACHE;
2389 /* byteswap if necessary */
2390 callback_list = hdr->b_acb;
2391 ASSERT(callback_list != NULL);
2392 if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
2393 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2394 byteswap_uint64_array :
2395 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2396 func(buf->b_data, hdr->b_size);
2399 arc_cksum_compute(buf, B_FALSE);
2401 /* create copies of the data buffer for the callers */
2403 for (acb = callback_list; acb; acb = acb->acb_next) {
2404 if (acb->acb_done) {
2406 abuf = arc_buf_clone(buf);
2407 acb->acb_buf = abuf;
2412 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2413 ASSERT(!HDR_BUF_AVAILABLE(hdr));
2415 hdr->b_flags |= ARC_BUF_AVAILABLE;
2417 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2419 if (zio->io_error != 0) {
2420 hdr->b_flags |= ARC_IO_ERROR;
2421 if (hdr->b_state != arc_anon)
2422 arc_change_state(arc_anon, hdr, hash_lock);
2423 if (HDR_IN_HASH_TABLE(hdr))
2424 buf_hash_remove(hdr);
2425 freeable = refcount_is_zero(&hdr->b_refcnt);
2429 * Broadcast before we drop the hash_lock to avoid the possibility
2430 * that the hdr (and hence the cv) might be freed before we get to
2431 * the cv_broadcast().
2433 cv_broadcast(&hdr->b_cv);
2437 * Only call arc_access on anonymous buffers. This is because
2438 * if we've issued an I/O for an evicted buffer, we've already
2439 * called arc_access (to prevent any simultaneous readers from
2440 * getting confused).
2442 if (zio->io_error == 0 && hdr->b_state == arc_anon)
2443 arc_access(hdr, hash_lock);
2444 mutex_exit(hash_lock);
2447 * This block was freed while we waited for the read to
2448 * complete. It has been removed from the hash table and
2449 * moved to the anonymous state (so that it won't show up in the cache).
2452 ASSERT3P(hdr->b_state, ==, arc_anon);
2453 freeable = refcount_is_zero(&hdr->b_refcnt);
2456 /* execute each callback and free its structure */
2457 while ((acb = callback_list) != NULL) {
2459 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2461 if (acb->acb_zio_dummy != NULL) {
2462 acb->acb_zio_dummy->io_error = zio->io_error;
2463 zio_nowait(acb->acb_zio_dummy);
2466 callback_list = acb->acb_next;
2467 kmem_free(acb, sizeof (arc_callback_t));
2471 arc_hdr_destroy(hdr);
2475 * "Read" the block block at the specified DVA (in bp) via the
2476 * cache. If the block is found in the cache, invoke the provided
2477 * callback immediately and return. Note that the `zio' parameter
2478 * in the callback will be NULL in this case, since no IO was
2479 * required. If the block is not in the cache pass the read request
2480 * on to the spa with a substitute callback function, so that the
2481 * requested block will be added to the cache.
2483 * If a read request arrives for a block that has a read in-progress,
2484 * either wait for the in-progress read to complete (and return the
2485 * results); or, if this is a read with a "done" func, add a record
2486 * to the read to invoke the "done" func when the read completes,
2487 * and return; or just return.
2489 * arc_read_done() will invoke all the requested "done" functions
2490 * for readers of this block.
2492 * Normal callers should use arc_read and pass the arc buffer and offset
2493 * for the bp. But if you know you don't need locking, you can use arc_read_nolock().
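 *
 * A minimal, hypothetical synchronous caller might look like the sketch
 * below (my_done(), my_arg and zb are placeholders, not part of this
 * file; zb is a zbookmark_t identifying the block being read):
 *
 *     uint32_t aflags = ARC_WAIT;
 *     int err = arc_read(NULL, spa, bp, pbuf, my_done, my_arg,
 *         ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * With ARC_WAIT the call blocks until the data is cached (or an error
 * occurs); with ARC_NOWAIT it returns immediately and my_done() fires
 * when the read completes.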
2497 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
2498 arc_done_func_t *done, void *private, int priority, int zio_flags,
2499 uint32_t *arc_flags, const zbookmark_t *zb)
2502 arc_buf_hdr_t *hdr = pbuf->b_hdr;
2504 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2505 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2506 rw_enter(&pbuf->b_lock, RW_READER);
2508 err = arc_read_nolock(pio, spa, bp, done, private, priority,
2509 zio_flags, arc_flags, zb);
2511 ASSERT3P(hdr, ==, pbuf->b_hdr);
2512 rw_exit(&pbuf->b_lock);
2517 arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
2518 arc_done_func_t *done, void *private, int priority, int zio_flags,
2519 uint32_t *arc_flags, const zbookmark_t *zb)
2523 kmutex_t *hash_lock;
2527 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2528 if (hdr && hdr->b_datacnt > 0) {
2530 *arc_flags |= ARC_CACHED;
2532 if (HDR_IO_IN_PROGRESS(hdr)) {
2534 if (*arc_flags & ARC_WAIT) {
2535 cv_wait(&hdr->b_cv, hash_lock);
2536 mutex_exit(hash_lock);
2539 ASSERT(*arc_flags & ARC_NOWAIT);
2542 arc_callback_t *acb = NULL;
2544 acb = kmem_zalloc(sizeof (arc_callback_t),
2546 acb->acb_done = done;
2547 acb->acb_private = private;
2549 acb->acb_zio_dummy = zio_null(pio,
2550 spa, NULL, NULL, zio_flags);
2552 ASSERT(acb->acb_done != NULL);
2553 acb->acb_next = hdr->b_acb;
2555 add_reference(hdr, hash_lock, private);
2556 mutex_exit(hash_lock);
2559 mutex_exit(hash_lock);
2563 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2566 add_reference(hdr, hash_lock, private);
2568 * If this block is already in use, create a new
2569 * copy of the data so that we will be guaranteed
2570 * that arc_release() will always succeed.
2574 ASSERT(buf->b_data);
2575 if (HDR_BUF_AVAILABLE(hdr)) {
2576 ASSERT(buf->b_efunc == NULL);
2577 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2579 buf = arc_buf_clone(buf);
2581 } else if (*arc_flags & ARC_PREFETCH &&
2582 refcount_count(&hdr->b_refcnt) == 0) {
2583 hdr->b_flags |= ARC_PREFETCH;
2585 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2586 arc_access(hdr, hash_lock);
2587 if (*arc_flags & ARC_L2CACHE)
2588 hdr->b_flags |= ARC_L2CACHE;
2589 mutex_exit(hash_lock);
2590 ARCSTAT_BUMP(arcstat_hits);
2591 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2592 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2593 data, metadata, hits);
2596 done(NULL, buf, private);
2598 uint64_t size = BP_GET_LSIZE(bp);
2599 arc_callback_t *acb;
2604 /* this block is not in the cache */
2605 arc_buf_hdr_t *exists;
2606 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2607 buf = arc_buf_alloc(spa, size, private, type);
2609 hdr->b_dva = *BP_IDENTITY(bp);
2610 hdr->b_birth = bp->blk_birth;
2611 hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2612 exists = buf_hash_insert(hdr, &hash_lock);
2614 /* somebody beat us to the hash insert */
2615 mutex_exit(hash_lock);
2616 bzero(&hdr->b_dva, sizeof (dva_t));
2619 (void) arc_buf_remove_ref(buf, private);
2620 goto top; /* restart the IO request */
2622 /* if this is a prefetch, we don't have a reference */
2623 if (*arc_flags & ARC_PREFETCH) {
2624 (void) remove_reference(hdr, hash_lock,
2626 hdr->b_flags |= ARC_PREFETCH;
2628 if (*arc_flags & ARC_L2CACHE)
2629 hdr->b_flags |= ARC_L2CACHE;
2630 if (BP_GET_LEVEL(bp) > 0)
2631 hdr->b_flags |= ARC_INDIRECT;
2633 /* this block is in the ghost cache */
2634 ASSERT(GHOST_STATE(hdr->b_state));
2635 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2636 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2637 ASSERT(hdr->b_buf == NULL);
2639 /* if this is a prefetch, we don't have a reference */
2640 if (*arc_flags & ARC_PREFETCH)
2641 hdr->b_flags |= ARC_PREFETCH;
2643 add_reference(hdr, hash_lock, private);
2644 if (*arc_flags & ARC_L2CACHE)
2645 hdr->b_flags |= ARC_L2CACHE;
2646 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2649 buf->b_efunc = NULL;
2650 buf->b_private = NULL;
2653 arc_get_data_buf(buf);
2654 ASSERT(hdr->b_datacnt == 0);
2659 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2660 acb->acb_done = done;
2661 acb->acb_private = private;
2663 ASSERT(hdr->b_acb == NULL);
2665 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2668 * If the buffer has been evicted, migrate it to a present state
2669 * before issuing the I/O. Once we drop the hash-table lock,
2670 * the header will be marked as I/O in progress and have an
2671 * attached buffer. At this point, anybody who finds this
2672 * buffer ought to notice that it's legit but has a pending I/O.
2675 if (GHOST_STATE(hdr->b_state))
2676 arc_access(hdr, hash_lock);
2678 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2679 (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2680 addr = hdr->b_l2hdr->b_daddr;
2682 * Lock out device removal.
2684 if (vdev_is_dead(vd) ||
2685 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2689 mutex_exit(hash_lock);
2691 ASSERT3U(hdr->b_size, ==, size);
2692 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2694 ARCSTAT_BUMP(arcstat_misses);
2695 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2696 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2697 data, metadata, misses);
2701 * Read from the L2ARC if the following are true:
2702 * 1. The L2ARC vdev was previously cached.
2703 * 2. This buffer still has L2ARC metadata.
2704 * 3. This buffer isn't currently writing to the L2ARC.
2705 * 4. The L2ARC entry wasn't evicted, which may
2706 * also have invalidated the vdev.
2708 if (hdr->b_l2hdr != NULL &&
2709 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
2710 l2arc_read_callback_t *cb;
2712 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2713 ARCSTAT_BUMP(arcstat_l2_hits);
2715 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2717 cb->l2rcb_buf = buf;
2718 cb->l2rcb_spa = spa;
2721 cb->l2rcb_flags = zio_flags;
2724 * l2arc read. The SCL_L2ARC lock will be
2725 * released by l2arc_read_done().
2727 rzio = zio_read_phys(pio, vd, addr, size,
2728 buf->b_data, ZIO_CHECKSUM_OFF,
2729 l2arc_read_done, cb, priority, zio_flags |
2730 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2731 ZIO_FLAG_DONT_PROPAGATE |
2732 ZIO_FLAG_DONT_RETRY, B_FALSE);
2733 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2736 if (*arc_flags & ARC_NOWAIT) {
2741 ASSERT(*arc_flags & ARC_WAIT);
2742 if (zio_wait(rzio) == 0)
2745 /* l2arc read error; goto zio_read() */
2747 DTRACE_PROBE1(l2arc__miss,
2748 arc_buf_hdr_t *, hdr);
2749 ARCSTAT_BUMP(arcstat_l2_misses);
2750 if (HDR_L2_WRITING(hdr))
2751 ARCSTAT_BUMP(arcstat_l2_rw_clash);
2752 spa_config_exit(spa, SCL_L2ARC, vd);
2756 rzio = zio_read(pio, spa, bp, buf->b_data, size,
2757 arc_read_done, buf, priority, zio_flags, zb);
2759 if (*arc_flags & ARC_WAIT)
2760 return (zio_wait(rzio));
2762 ASSERT(*arc_flags & ARC_NOWAIT);
2769 * arc_read() variant to support pool traversal. If the block is already
2770 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2771 * The idea is that we don't want pool traversal filling up memory, but
2772 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2775 arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2781 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2783 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2784 arc_buf_t *buf = hdr->b_buf;
2787 while (buf->b_data == NULL) {
2791 bcopy(buf->b_data, data, hdr->b_size);
2797 mutex_exit(hash_mtx);
2803 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2805 ASSERT(buf->b_hdr != NULL);
2806 ASSERT(buf->b_hdr->b_state != arc_anon);
2807 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2808 buf->b_efunc = func;
2809 buf->b_private = private;
2813 * This is used by the DMU to let the ARC know that a buffer is
2814 * being evicted, so the ARC should clean up. If this arc buf
2815 * is not yet in the evicted state, it will be put there.
2818 arc_buf_evict(arc_buf_t *buf)
2821 kmutex_t *hash_lock;
2824 rw_enter(&buf->b_lock, RW_WRITER);
2828 * We are in arc_do_user_evicts().
2830 ASSERT(buf->b_data == NULL);
2831 rw_exit(&buf->b_lock);
2833 } else if (buf->b_data == NULL) {
2834 arc_buf_t copy = *buf; /* structure assignment */
2836 * We are on the eviction list; process this buffer now
2837 * but let arc_do_user_evicts() do the reaping.
2839 buf->b_efunc = NULL;
2840 rw_exit(&buf->b_lock);
2841 VERIFY(copy.b_efunc(&copy) == 0);
2844 hash_lock = HDR_LOCK(hdr);
2845 mutex_enter(hash_lock);
2847 ASSERT(buf->b_hdr == hdr);
2848 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2849 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2852 * Pull this buffer off of the hdr
2855 while (*bufp != buf)
2856 bufp = &(*bufp)->b_next;
2857 *bufp = buf->b_next;
2859 ASSERT(buf->b_data != NULL);
2860 arc_buf_destroy(buf, FALSE, FALSE);
2862 if (hdr->b_datacnt == 0) {
2863 arc_state_t *old_state = hdr->b_state;
2864 arc_state_t *evicted_state;
2866 ASSERT(refcount_is_zero(&hdr->b_refcnt));
2869 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2871 mutex_enter(&old_state->arcs_mtx);
2872 mutex_enter(&evicted_state->arcs_mtx);
2874 arc_change_state(evicted_state, hdr, hash_lock);
2875 ASSERT(HDR_IN_HASH_TABLE(hdr));
2876 hdr->b_flags |= ARC_IN_HASH_TABLE;
2877 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2879 mutex_exit(&evicted_state->arcs_mtx);
2880 mutex_exit(&old_state->arcs_mtx);
2882 mutex_exit(hash_lock);
2883 rw_exit(&buf->b_lock);
2885 VERIFY(buf->b_efunc(buf) == 0);
2886 buf->b_efunc = NULL;
2887 buf->b_private = NULL;
2889 kmem_cache_free(buf_cache, buf);
2894 * Release this buffer from the cache. This must be done
2895 * after a read and prior to modifying the buffer contents.
2896 * If the buffer has more than one reference, we must make
2897 * a new hdr for the buffer.
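 *
 * In effect, the caller ends up holding an anonymous (uncached) copy
 * that it is free to dirty, while any remaining references continue to
 * see the original cached data under the old header.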
2900 arc_release(arc_buf_t *buf, void *tag)
2903 kmutex_t *hash_lock;
2904 l2arc_buf_hdr_t *l2hdr;
2907 rw_enter(&buf->b_lock, RW_WRITER);
2910 /* this buffer is not on any list */
2911 ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2912 ASSERT(!(hdr->b_flags & ARC_STORED));
2914 if (hdr->b_state == arc_anon) {
2915 /* this buffer is already released */
2916 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2917 ASSERT(BUF_EMPTY(hdr));
2918 ASSERT(buf->b_efunc == NULL);
2920 rw_exit(&buf->b_lock);
2924 hash_lock = HDR_LOCK(hdr);
2925 mutex_enter(hash_lock);
2927 l2hdr = hdr->b_l2hdr;
2929 mutex_enter(&l2arc_buflist_mtx);
2930 hdr->b_l2hdr = NULL;
2931 buf_size = hdr->b_size;
2935 * Do we have more than one buf?
2937 if (hdr->b_datacnt > 1) {
2938 arc_buf_hdr_t *nhdr;
2940 uint64_t blksz = hdr->b_size;
2941 spa_t *spa = hdr->b_spa;
2942 arc_buf_contents_t type = hdr->b_type;
2943 uint32_t flags = hdr->b_flags;
2945 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
2947 * Pull the data off of this buf and attach it to
2948 * a new anonymous buf.
2950 (void) remove_reference(hdr, hash_lock, tag);
2952 while (*bufp != buf)
2953 bufp = &(*bufp)->b_next;
2954 *bufp = (*bufp)->b_next;
2957 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
2958 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
2959 if (refcount_is_zero(&hdr->b_refcnt)) {
2960 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
2961 ASSERT3U(*size, >=, hdr->b_size);
2962 atomic_add_64(size, -hdr->b_size);
2964 hdr->b_datacnt -= 1;
2965 arc_cksum_verify(buf);
2967 mutex_exit(hash_lock);
2969 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
2970 nhdr->b_size = blksz;
2972 nhdr->b_type = type;
2974 nhdr->b_state = arc_anon;
2975 nhdr->b_arc_access = 0;
2976 nhdr->b_flags = flags & ARC_L2_WRITING;
2977 nhdr->b_l2hdr = NULL;
2978 nhdr->b_datacnt = 1;
2979 nhdr->b_freeze_cksum = NULL;
2980 (void) refcount_add(&nhdr->b_refcnt, tag);
2982 rw_exit(&buf->b_lock);
2983 atomic_add_64(&arc_anon->arcs_size, blksz);
2985 rw_exit(&buf->b_lock);
2986 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2987 ASSERT(!list_link_active(&hdr->b_arc_node));
2988 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2989 arc_change_state(arc_anon, hdr, hash_lock);
2990 hdr->b_arc_access = 0;
2991 mutex_exit(hash_lock);
2993 bzero(&hdr->b_dva, sizeof (dva_t));
2998 buf->b_efunc = NULL;
2999 buf->b_private = NULL;
3002 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3003 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3004 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3005 mutex_exit(&l2arc_buflist_mtx);
3010 arc_released(arc_buf_t *buf)
3014 rw_enter(&buf->b_lock, RW_READER);
3015 released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3016 rw_exit(&buf->b_lock);
3021 arc_has_callback(arc_buf_t *buf)
3025 rw_enter(&buf->b_lock, RW_READER);
3026 callback = (buf->b_efunc != NULL);
3027 rw_exit(&buf->b_lock);
3033 arc_referenced(arc_buf_t *buf)
3037 rw_enter(&buf->b_lock, RW_READER);
3038 referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3039 rw_exit(&buf->b_lock);
3040 return (referenced);
3045 arc_write_ready(zio_t *zio)
3047 arc_write_callback_t *callback = zio->io_private;
3048 arc_buf_t *buf = callback->awcb_buf;
3049 arc_buf_hdr_t *hdr = buf->b_hdr;
3051 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3052 callback->awcb_ready(zio, buf, callback->awcb_private);
3055 * If the IO is already in progress, then this is a re-write
3056 * attempt, so we need to thaw and re-compute the cksum.
3057 * It is the responsibility of the callback to handle the
3058 * accounting for any re-write attempt.
3060 if (HDR_IO_IN_PROGRESS(hdr)) {
3061 mutex_enter(&hdr->b_freeze_lock);
3062 if (hdr->b_freeze_cksum != NULL) {
3063 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3064 hdr->b_freeze_cksum = NULL;
3066 mutex_exit(&hdr->b_freeze_lock);
3068 arc_cksum_compute(buf, B_FALSE);
3069 hdr->b_flags |= ARC_IO_IN_PROGRESS;
3073 arc_write_done(zio_t *zio)
3075 arc_write_callback_t *callback = zio->io_private;
3076 arc_buf_t *buf = callback->awcb_buf;
3077 arc_buf_hdr_t *hdr = buf->b_hdr;
3081 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3082 hdr->b_birth = zio->io_bp->blk_birth;
3083 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3085 * If the block to be written was all-zero, we may have
3086 * compressed it away. In this case no write was performed
3087 * so there will be no dva/birth-date/checksum. The buffer
3088 * must therefore remain anonymous (and uncached).
3090 if (!BUF_EMPTY(hdr)) {
3091 arc_buf_hdr_t *exists;
3092 kmutex_t *hash_lock;
3094 arc_cksum_verify(buf);
3096 exists = buf_hash_insert(hdr, &hash_lock);
3099 * This can only happen if we overwrite for
3100 * sync-to-convergence, because we remove
3101 * buffers from the hash table when we arc_free().
3103 ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
3104 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
3105 BP_IDENTITY(zio->io_bp)));
3106 ASSERT3U(zio->io_bp_orig.blk_birth, ==,
3107 zio->io_bp->blk_birth);
3109 ASSERT(refcount_is_zero(&exists->b_refcnt));
3110 arc_change_state(arc_anon, exists, hash_lock);
3111 mutex_exit(hash_lock);
3112 arc_hdr_destroy(exists);
3113 exists = buf_hash_insert(hdr, &hash_lock);
3114 ASSERT3P(exists, ==, NULL);
3116 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3117 /* if it's not anon, we are doing a scrub */
3118 if (hdr->b_state == arc_anon)
3119 arc_access(hdr, hash_lock);
3120 mutex_exit(hash_lock);
3121 } else if (callback->awcb_done == NULL) {
3124 * This is an anonymous buffer with no user callback,
3125 * destroy it if there are no active references.
3127 mutex_enter(&arc_eviction_mtx);
3128 destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
3129 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3130 mutex_exit(&arc_eviction_mtx);
3132 arc_hdr_destroy(hdr);
3134 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3136 hdr->b_flags &= ~ARC_STORED;
3138 if (callback->awcb_done) {
3139 ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3140 callback->awcb_done(zio, buf, callback->awcb_private);
3143 kmem_free(callback, sizeof (arc_write_callback_t));
3147 write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
3149 boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
3151 /* Determine checksum setting */
3154 * Metadata always gets checksummed. If the data
3155 * checksum is multi-bit correctable, and it's not a
3156 * ZBT-style checksum, then it's suitable for metadata
3157 * as well. Otherwise, the metadata checksum defaults to fletcher4.
3160 if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
3161 !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
3162 zp->zp_checksum = wp->wp_oschecksum;
3164 zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
3166 zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
3170 /* Determine compression setting */
3173 * XXX -- we should design a compression algorithm
3174 * that specializes in arrays of bps.
3176 zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
3179 zp->zp_compress = zio_compress_select(wp->wp_dncompress,
3183 zp->zp_type = wp->wp_type;
3184 zp->zp_level = wp->wp_level;
3185 zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
3189 arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
3190 boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
3191 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
3192 int zio_flags, const zbookmark_t *zb)
3194 arc_buf_hdr_t *hdr = buf->b_hdr;
3195 arc_write_callback_t *callback;
3199 ASSERT(ready != NULL);
3200 ASSERT(!HDR_IO_ERROR(hdr));
3201 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3202 ASSERT(hdr->b_acb == 0);
3204 hdr->b_flags |= ARC_L2CACHE;
3205 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3206 callback->awcb_ready = ready;
3207 callback->awcb_done = done;
3208 callback->awcb_private = private;
3209 callback->awcb_buf = buf;
3211 write_policy(spa, wp, &zp);
3212 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
3213 arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3219 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
3220 zio_done_func_t *done, void *private, uint32_t arc_flags)
3223 kmutex_t *hash_lock;
3227 * If this buffer is in the cache, release it, so it can be re-used.
3230 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
3233 * The checksum of blocks to free is not always
3234 * preserved (eg. on the deadlist). However, if it is
3235 * nonzero, it should match what we have in the cache.
3237 ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
3238 bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
3239 bp->blk_fill == BLK_FILL_ALREADY_FREED);
3241 if (ab->b_state != arc_anon)
3242 arc_change_state(arc_anon, ab, hash_lock);
3243 if (HDR_IO_IN_PROGRESS(ab)) {
3245 * This should only happen when we prefetch.
3247 ASSERT(ab->b_flags & ARC_PREFETCH);
3248 ASSERT3U(ab->b_datacnt, ==, 1);
3249 ab->b_flags |= ARC_FREED_IN_READ;
3250 if (HDR_IN_HASH_TABLE(ab))
3251 buf_hash_remove(ab);
3252 ab->b_arc_access = 0;
3253 bzero(&ab->b_dva, sizeof (dva_t));
3256 ab->b_buf->b_efunc = NULL;
3257 ab->b_buf->b_private = NULL;
3258 mutex_exit(hash_lock);
3259 } else if (refcount_is_zero(&ab->b_refcnt)) {
3260 ab->b_flags |= ARC_FREE_IN_PROGRESS;
3261 mutex_exit(hash_lock);
3262 arc_hdr_destroy(ab);
3263 ARCSTAT_BUMP(arcstat_deleted);
3266 * We still have an active reference on this
3267 * buffer. This can happen, e.g., from
3268 * dbuf_unoverride().
3270 ASSERT(!HDR_IN_HASH_TABLE(ab));
3271 ab->b_arc_access = 0;
3272 bzero(&ab->b_dva, sizeof (dva_t));
3275 ab->b_buf->b_efunc = NULL;
3276 ab->b_buf->b_private = NULL;
3277 mutex_exit(hash_lock);
3281 zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
3283 if (arc_flags & ARC_WAIT)
3284 return (zio_wait(zio));
3286 ASSERT(arc_flags & ARC_NOWAIT);
3293 arc_memory_throttle(uint64_t reserve, uint64_t txg)
3296 uint64_t inflight_data = arc_anon->arcs_size;
3297 uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
3298 static uint64_t page_load = 0;
3299 static uint64_t last_txg = 0;
3304 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3307 if (available_memory >= zfs_write_limit_max)
3310 if (txg > last_txg) {
3315 * If we are in pageout, we know that memory is already tight,
3316 * the arc is already going to be evicting, so we just want to
3317 * continue to let page writes occur as quickly as possible.
3319 if (curproc == pageproc) {
3320 if (page_load > available_memory / 4)
3322 /* Note: reserve is inflated, so we deflate */
3323 page_load += reserve / 8;
3325 } else if (page_load > 0 && arc_reclaim_needed()) {
3326 /* memory is low, delay before restarting */
3327 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3332 if (arc_size > arc_c_min) {
3333 uint64_t evictable_memory =
3334 arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3335 arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3336 arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3337 arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3338 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3341 if (inflight_data > available_memory / 4) {
3342 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3350 arc_tempreserve_clear(uint64_t reserve)
3352 atomic_add_64(&arc_tempreserve, -reserve);
3353 ASSERT((int64_t)arc_tempreserve >= 0);
3357 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3363 * Once in a while, fail for no reason. Everything should cope.
3365 if (spa_get_random(10000) == 0) {
3366 dprintf("forcing random failure\n");
3370 if (reserve > arc_c/4 && !arc_no_grow)
3371 arc_c = MIN(arc_c_max, reserve * 4);
3372 if (reserve > arc_c)
3376 * Writes will, almost always, require additional memory allocations
3377 * in order to compress/encrypt/etc the data. We therefore need to
3378 * make sure that there is sufficient available memory for this.
3380 if (error = arc_memory_throttle(reserve, txg))
3384 * Throttle writes when the amount of dirty data in the cache
3385 * gets too large. We try to keep the cache less than half full
3386 * of dirty blocks so that our sync times don't grow too large.
3387 * Note: if two requests come in concurrently, we might let them
3388 * both succeed, when one of them should fail. Not a huge deal.
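 *
 * As a rough illustration: with arc_c at 1GB, a new reserve is refused
 * once outstanding reserves plus anonymous (dirty) data would exceed
 * 512MB while the anonymous data alone already exceeds 256MB.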
3390 if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
3391 arc_anon->arcs_size > arc_c / 4) {
3392 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3393 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3394 arc_tempreserve>>10,
3395 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3396 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3397 reserve>>10, arc_c>>10);
3400 atomic_add_64(&arc_tempreserve, reserve);
3404 static kmutex_t arc_lowmem_lock;
3406 static eventhandler_tag arc_event_lowmem = NULL;
3409 arc_lowmem(void *arg __unused, int howto __unused)
3412 /* Serialize access via arc_lowmem_lock. */
3413 mutex_enter(&arc_lowmem_lock);
3415 cv_signal(&arc_reclaim_thr_cv);
3417 tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
3418 mutex_exit(&arc_lowmem_lock);
3425 int prefetch_tunable_set = 0;
3427 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3428 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3429 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3431 /* Convert seconds to clock ticks */
3432 arc_min_prefetch_lifespan = 1 * hz;
3434 /* Start out with 1/8 of all memory */
3435 arc_c = kmem_size() / 8;
3439 * On architectures where the physical memory can be larger
3440 * than the addressable space (intel in 32-bit mode), we may
3441 * need to limit the cache to 1/8 of VM size.
3443 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3446 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3447 arc_c_min = MAX(arc_c / 4, 64<<18);
3448 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3449 if (arc_c * 8 >= 1<<30)
3450 arc_c_max = (arc_c * 8) - (1<<30);
3452 arc_c_max = arc_c_min;
3453 arc_c_max = MAX(arc_c * 5, arc_c_max);
3456 * Allow the tunables to override our calculations if they are
3457 * reasonable (i.e. over 16MB)
3459 if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
3460 arc_c_max = zfs_arc_max;
3461 if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
3462 arc_c_min = zfs_arc_min;
3465 arc_p = (arc_c >> 1);
3467 /* limit meta-data to 1/4 of the arc capacity */
3468 arc_meta_limit = arc_c_max / 4;
3470 /* Allow the tunable to override if it is reasonable */
3471 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3472 arc_meta_limit = zfs_arc_meta_limit;
3474 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3475 arc_c_min = arc_meta_limit / 2;
3477 /* if kmem_flags are set, let's try to use less memory */
3478 if (kmem_debugging())
3480 if (arc_c < arc_c_min)
3483 zfs_arc_min = arc_c_min;
3484 zfs_arc_max = arc_c_max;
3486 arc_anon = &ARC_anon;
3488 arc_mru_ghost = &ARC_mru_ghost;
3490 arc_mfu_ghost = &ARC_mfu_ghost;
3491 arc_l2c_only = &ARC_l2c_only;
3494 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3495 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3496 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3497 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3498 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3499 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3501 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3502 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3503 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3504 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3505 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3506 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3507 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3508 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3509 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3510 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3511 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3512 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3513 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3514 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3515 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3516 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3517 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3518 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3519 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3520 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3524 arc_thread_exit = 0;
3525 arc_eviction_list = NULL;
3526 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3527 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3529 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3530 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3532 if (arc_ksp != NULL) {
3533 arc_ksp->ks_data = &arc_stats;
3534 kstat_install(arc_ksp);
3537 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3538 TS_RUN, minclsyspri);
3541 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
3542 EVENTHANDLER_PRI_FIRST);
3548 if (zfs_write_limit_max == 0)
3549 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3551 zfs_write_limit_shift = 0;
3552 mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3555 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
3556 prefetch_tunable_set = 1;
3559 if (prefetch_tunable_set == 0) {
3560 printf("ZFS NOTICE: prefetch is disabled by default on i386"
3561 " - add enable to tunable to change.\n" );
3562 zfs_prefetch_disable=1;
3565 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
3566 prefetch_tunable_set == 0) {
3567 printf("ZFS NOTICE: system has less than 4GB and prefetch enable is not set"
3568 "... disabling.\n");
3569 zfs_prefetch_disable=1;
3572 /* Warn about ZFS memory and address space requirements. */
3573 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
3574 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
3575 "expect unstable behavior.\n");
3577 if (kmem_size() < 512 * (1 << 20)) {
3578 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
3579 "expect unstable behavior.\n");
3580 printf(" Consider tuning vm.kmem_size and "
3581 "vm.kmem_size_max\n");
3582 printf(" in /boot/loader.conf.\n");
3591 mutex_enter(&arc_reclaim_thr_lock);
3592 arc_thread_exit = 1;
3593 cv_signal(&arc_reclaim_thr_cv);
3594 while (arc_thread_exit != 0)
3595 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3596 mutex_exit(&arc_reclaim_thr_lock);
3602 if (arc_ksp != NULL) {
3603 kstat_delete(arc_ksp);
3607 mutex_destroy(&arc_eviction_mtx);
3608 mutex_destroy(&arc_reclaim_thr_lock);
3609 cv_destroy(&arc_reclaim_thr_cv);
3611 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3612 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3613 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3614 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3615 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3616 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3617 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3618 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3620 mutex_destroy(&arc_anon->arcs_mtx);
3621 mutex_destroy(&arc_mru->arcs_mtx);
3622 mutex_destroy(&arc_mru_ghost->arcs_mtx);
3623 mutex_destroy(&arc_mfu->arcs_mtx);
3624 mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3626 mutex_destroy(&zfs_write_limit_lock);
3630 mutex_destroy(&arc_lowmem_lock);
3632 if (arc_event_lowmem != NULL)
3633 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
3640 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3641 * It uses dedicated storage devices to hold cached data, which are populated
3642 * using large infrequent writes. The main role of this cache is to boost
3643 * the performance of random read workloads. The intended L2ARC devices
3644 * include short-stroked disks, solid state disks, and other media with
3645 * substantially faster read latency than disk.
3647 * +-----------------------+
3649 * +-----------------------+
3652 * l2arc_feed_thread() arc_read()
3656 * +---------------+ |
3658 * +---------------+ |
3663 * +-------+ +-------+
3665 * | cache | | cache |
3666 * +-------+ +-------+
3667 * +=========+ .-----.
3668 * : L2ARC : |-_____-|
3669 * : devices : | Disks |
3670 * +=========+ `-_____-'
3672 * Read requests are satisfied from the following sources, in order:
3675 * 2) vdev cache of L2ARC devices
3677 * 4) vdev cache of disks
3680 * Some L2ARC device types exhibit extremely slow write performance.
3681 * To accommodate this, there are some significant differences between
3682 * the L2ARC and traditional cache design:
3684 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
3685 * the ARC behave as usual, freeing buffers and placing headers on ghost
3686 * lists. The ARC does not send buffers to the L2ARC during eviction as
3687 * this would add inflated write latencies for all ARC memory pressure.
3689 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3690 * It does this by periodically scanning buffers from the eviction-end of
3691 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3692 * not already there. It scans until a headroom of buffers is satisfied,
3693 * which itself is a buffer for ARC eviction. The thread that does this is
3694 * l2arc_feed_thread(), illustrated below; example sizes are included to
3695 * provide a better sense of ratio than this diagram:
3698 * +---------------------+----------+
3699 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
3700 * +---------------------+----------+ | o L2ARC eligible
3701 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
3702 * +---------------------+----------+ |
3703 * 15.9 Gbytes ^ 32 Mbytes |
3705 * l2arc_feed_thread()
3707 * l2arc write hand <--[oooo]--'
3711 * +==============================+
3712 * L2ARC dev |####|#|###|###| |####| ... |
3713 * +==============================+
3716 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3717 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3718 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
3719 * safe to say that this is an uncommon case, since buffers at the end of
3720 * the ARC lists have moved there due to inactivity.
3722 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3723 * then the L2ARC simply misses copying some buffers. This serves as a
3724 * pressure valve to prevent heavy read workloads from both stalling the ARC
3725 * with waits and clogging the L2ARC with writes. This also helps prevent
3726 * the potential for the L2ARC to churn if it attempts to cache content too
3727 * quickly, such as during backups of the entire pool.
3729 * 5. After system boot and before the ARC has filled main memory, there are
3730 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3731 * lists can remain mostly static. Instead of searching from the tail of these
3732 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3733 * for eligible buffers, greatly increasing its chance of finding them.
3735 * The L2ARC device write speed is also boosted during this time so that
3736 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
3737 * there are no L2ARC reads, and no fear of degrading read performance
3738 * through increased writes.
3740 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3741 * the vdev queue can aggregate them into larger and fewer writes. Each
3742 * device is written to in a rotor fashion, sweeping writes through
3743 * available space then repeating.
3745 * 7. The L2ARC does not store dirty content. It never needs to flush
3746 * write buffers back to disk based storage.
3748 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3749 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3751 * The performance of the L2ARC can be tweaked by a number of tunables, which
3752 * may be necessary for different workloads:
3754 * l2arc_write_max max write bytes per interval
3755 * l2arc_write_boost extra write bytes during device warmup
3756 * l2arc_noprefetch skip caching prefetched buffers
3757 * l2arc_headroom number of max device writes to precache
3758 * l2arc_feed_secs seconds between L2ARC writing
3760 * Tunables may be removed or added as future performance improvements are
3761 * integrated, and also may become zpool properties.
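 *
 * A rough sketch of how these tunables interact (loose pseudo-code,
 * not a verbatim copy of the code below):
 *
 *     target_sz = l2arc_write_max;
 *     if (the ARC is not yet warm)
 *         target_sz += l2arc_write_boost;
 *     headroom = target_sz * l2arc_headroom;
 *
 * so each L2ARC device receives at most target_sz bytes every
 * l2arc_feed_secs seconds, and each pass scans up to headroom bytes
 * from each ARC list for eligible buffers.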
3765 l2arc_hdr_stat_add(void)
3767 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3768 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3772 l2arc_hdr_stat_remove(void)
3774 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3775 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3779 * Cycle through L2ARC devices. This is how L2ARC load balances.
3780 * If a device is returned, this also returns holding the spa config lock.
3782 static l2arc_dev_t *
3783 l2arc_dev_get_next(void)
3785 l2arc_dev_t *first, *next = NULL;
3788 * Lock out the removal of spas (spa_namespace_lock), then removal
3789 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
3790 * both locks will be dropped and a spa config lock held instead.
3792 mutex_enter(&spa_namespace_lock);
3793 mutex_enter(&l2arc_dev_mtx);
3795 /* if there are no vdevs, there is nothing to do */
3796 if (l2arc_ndev == 0)
3800 next = l2arc_dev_last;
3802 /* loop around the list looking for a non-faulted vdev */
3804 next = list_head(l2arc_dev_list);
3806 next = list_next(l2arc_dev_list, next);
3808 next = list_head(l2arc_dev_list);
3811 /* if we have come back to the start, bail out */
3814 else if (next == first)
3817 } while (vdev_is_dead(next->l2ad_vdev));
3819 /* if we were unable to find any usable vdevs, return NULL */
3820 if (vdev_is_dead(next->l2ad_vdev))
3823 l2arc_dev_last = next;
3826 mutex_exit(&l2arc_dev_mtx);
3829 * Grab the config lock to prevent the 'next' device from being
3830 * removed while we are writing to it.
3833 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3834 mutex_exit(&spa_namespace_lock);
3840 * Free buffers that were tagged for destruction.
3843 l2arc_do_free_on_write()
3846 l2arc_data_free_t *df, *df_prev;
3848 mutex_enter(&l2arc_free_on_write_mtx);
3849 buflist = l2arc_free_on_write;
3851 for (df = list_tail(buflist); df; df = df_prev) {
3852 df_prev = list_prev(buflist, df);
3853 ASSERT(df->l2df_data != NULL);
3854 ASSERT(df->l2df_func != NULL);
3855 df->l2df_func(df->l2df_data, df->l2df_size);
3856 list_remove(buflist, df);
3857 kmem_free(df, sizeof (l2arc_data_free_t));
3860 mutex_exit(&l2arc_free_on_write_mtx);
3864 * A write to a cache device has completed. Update all headers to allow
3865 * reads from these buffers to begin.
3868 l2arc_write_done(zio_t *zio)
3870 l2arc_write_callback_t *cb;
3873 arc_buf_hdr_t *head, *ab, *ab_prev;
3874 l2arc_buf_hdr_t *abl2;
3875 kmutex_t *hash_lock;
3877 cb = zio->io_private;
3879 dev = cb->l2wcb_dev;
3880 ASSERT(dev != NULL);
3881 head = cb->l2wcb_head;
3882 ASSERT(head != NULL);
3883 buflist = dev->l2ad_buflist;
3884 ASSERT(buflist != NULL);
3885 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
3886 l2arc_write_callback_t *, cb);
3888 if (zio->io_error != 0)
3889 ARCSTAT_BUMP(arcstat_l2_writes_error);
3891 mutex_enter(&l2arc_buflist_mtx);
3894 * All writes completed, or an error was hit.
3896 for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
3897 ab_prev = list_prev(buflist, ab);
3899 hash_lock = HDR_LOCK(ab);
3900 if (!mutex_tryenter(hash_lock)) {
3902 * This buffer misses out. It may be in a stage
3903 * of eviction. Its ARC_L2_WRITING flag will be
3904 * left set, denying reads to this buffer.
3906 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
3910 if (zio->io_error != 0) {
3912 * Error - drop L2ARC entry.
3914 list_remove(buflist, ab);
3917 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
3918 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
3922 * Allow ARC to begin reads to this L2ARC entry.
3924 ab->b_flags &= ~ARC_L2_WRITING;
3926 mutex_exit(hash_lock);
3929 atomic_inc_64(&l2arc_writes_done);
3930 list_remove(buflist, head);
3931 kmem_cache_free(hdr_cache, head);
3932 mutex_exit(&l2arc_buflist_mtx);
3934 l2arc_do_free_on_write();
3936 kmem_free(cb, sizeof (l2arc_write_callback_t));
3940 * A read to a cache device completed. Validate buffer contents before
3941 * handing over to the regular ARC routines.
3944 l2arc_read_done(zio_t *zio)
3946 l2arc_read_callback_t *cb;
3949 kmutex_t *hash_lock;
3952 ASSERT(zio->io_vd != NULL);
3953 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
3955 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
3957 cb = zio->io_private;
3959 buf = cb->l2rcb_buf;
3960 ASSERT(buf != NULL);
3962 ASSERT(hdr != NULL);
3964 hash_lock = HDR_LOCK(hdr);
3965 mutex_enter(hash_lock);
3968 * Check this survived the L2ARC journey.
3970 equal = arc_cksum_equal(buf);
3971 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
3972 mutex_exit(hash_lock);
3973 zio->io_private = buf;
3974 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
3975 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
3978 mutex_exit(hash_lock);
3980 * Buffer didn't survive caching. Increment stats and
3981 * reissue to the original storage device.
3983 if (zio->io_error != 0) {
3984 ARCSTAT_BUMP(arcstat_l2_io_error);
3986 zio->io_error = EIO;
3989 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
3992 * If there's no waiter, issue an async i/o to the primary
3993 * storage now. If there *is* a waiter, the caller must
3994 * issue the i/o in a context where it's OK to block.
3996 if (zio->io_waiter == NULL)
3997 zio_nowait(zio_read(zio->io_parent,
3998 cb->l2rcb_spa, &cb->l2rcb_bp,
3999 buf->b_data, zio->io_size, arc_read_done, buf,
4000 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4003 kmem_free(cb, sizeof (l2arc_read_callback_t));
4007 * This is the list priority from which the L2ARC will search for pages to
4008 * cache. This is used within loops (0..3) to cycle through lists in the
4009 * desired order. This order can have a significant effect on cache performance.
4012 * Currently the metadata lists are hit first, MFU then MRU, followed by
4013 * the data lists. This function returns a locked list, and also returns the lock.
4017 l2arc_list_locked(int list_num, kmutex_t **lock)
4021 ASSERT(list_num >= 0 && list_num <= 3);
4025 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4026 *lock = &arc_mfu->arcs_mtx;
4029 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4030 *lock = &arc_mru->arcs_mtx;
4033 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4034 *lock = &arc_mfu->arcs_mtx;
4037 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4038 *lock = &arc_mru->arcs_mtx;
4042 ASSERT(!(MUTEX_HELD(*lock)));
4048 * Evict buffers from the device write hand to the distance specified in
4049 * bytes. This distance may span populated buffers, or it may span nothing.
4050 * This is clearing a region on the L2ARC device ready for writing.
4051 * If the 'all' boolean is set, every buffer is evicted.
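 *
 * For example, if the write hand sits at offset H and the distance is
 * D, any L2ARC header whose b_daddr falls between H and H + D is
 * invalidated so the region can be overwritten; when H + 2 * D would
 * run past the end of the device, eviction instead runs to l2ad_end,
 * since the write hand is about to wrap back to l2ad_start.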
4054 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4057 l2arc_buf_hdr_t *abl2;
4058 arc_buf_hdr_t *ab, *ab_prev;
4059 kmutex_t *hash_lock;
4062 buflist = dev->l2ad_buflist;
4064 if (buflist == NULL)
4067 if (!all && dev->l2ad_first) {
4069 * This is the first sweep through the device. There is nothing to evict yet.
4075 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4077 * When nearing the end of the device, evict to the end
4078 * before the device write hand jumps to the start.
4080 taddr = dev->l2ad_end;
4082 taddr = dev->l2ad_hand + distance;
4084 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4085 uint64_t, taddr, boolean_t, all);
4088 mutex_enter(&l2arc_buflist_mtx);
4089 for (ab = list_tail(buflist); ab; ab = ab_prev) {
4090 ab_prev = list_prev(buflist, ab);
4092 hash_lock = HDR_LOCK(ab);
4093 if (!mutex_tryenter(hash_lock)) {
4095 * Missed the hash lock. Retry.
4097 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4098 mutex_exit(&l2arc_buflist_mtx);
4099 mutex_enter(hash_lock);
4100 mutex_exit(hash_lock);
4104 if (HDR_L2_WRITE_HEAD(ab)) {
4106 * We hit a write head node. Leave it for
4107 * l2arc_write_done().
4109 list_remove(buflist, ab);
4110 mutex_exit(hash_lock);
4114 if (!all && ab->b_l2hdr != NULL &&
4115 (ab->b_l2hdr->b_daddr > taddr ||
4116 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4118 * We've evicted to the target address,
4119 * or the end of the device.
4121 mutex_exit(hash_lock);
4125 if (HDR_FREE_IN_PROGRESS(ab)) {
4127 * Already on the path to destruction.
4129 mutex_exit(hash_lock);
4133 if (ab->b_state == arc_l2c_only) {
4134 ASSERT(!HDR_L2_READING(ab));
4136 * This doesn't exist in the ARC. Destroy.
4137 * arc_hdr_destroy() will call list_remove()
4138 * and decrement arcstat_l2_size.
4140 arc_change_state(arc_anon, ab, hash_lock);
4141 arc_hdr_destroy(ab);
4144 * Invalidate issued or about to be issued
4145 * reads, since we may be about to write
4146 * over this location.
4148 if (HDR_L2_READING(ab)) {
4149 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4150 ab->b_flags |= ARC_L2_EVICTED;
4154 * Tell ARC this no longer exists in L2ARC.
4156 if (ab->b_l2hdr != NULL) {
4159 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4160 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4162 list_remove(buflist, ab);
4165 * This may have been leftover after a
4168 ab->b_flags &= ~ARC_L2_WRITING;
4170 mutex_exit(hash_lock);
4172 mutex_exit(&l2arc_buflist_mtx);
4174 spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
4175 dev->l2ad_evict = taddr;
4179 * Find and write ARC buffers to the L2ARC device.
4181 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4182 * for reading until they have completed writing.
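 *
 * A buffer is skipped by the copy loop below if it belongs to a
 * different spa, already has an L2ARC header, has an I/O in progress,
 * is not flagged ARC_L2CACHE, has no data attached, or would push the
 * pass over its target_sz write budget.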
4185 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4187 arc_buf_hdr_t *ab, *ab_prev, *head;
4188 l2arc_buf_hdr_t *hdrl2;
4190 uint64_t passed_sz, write_sz, buf_sz, headroom;
4192 kmutex_t *hash_lock, *list_lock;
4193 boolean_t have_lock, full;
4194 l2arc_write_callback_t *cb;
4198 ASSERT(dev->l2ad_vdev != NULL);
4203 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4204 head->b_flags |= ARC_L2_WRITE_HEAD;
4207 * Copy buffers for L2ARC writing.
4209 mutex_enter(&l2arc_buflist_mtx);
4210 for (try = 0; try <= 3; try++) {
4211 list = l2arc_list_locked(try, &list_lock);
4215 * L2ARC fast warmup.
4217 * Until the ARC is warm and starts to evict, read from the
4218 * head of the ARC lists rather than the tail.
4220 headroom = target_sz * l2arc_headroom;
4221 if (arc_warm == B_FALSE)
4222 ab = list_head(list);
4224 ab = list_tail(list);
4226 for (; ab; ab = ab_prev) {
4227 if (arc_warm == B_FALSE)
4228 ab_prev = list_next(list, ab);
4230 ab_prev = list_prev(list, ab);
4232 hash_lock = HDR_LOCK(ab);
4233 have_lock = MUTEX_HELD(hash_lock);
4234 if (!have_lock && !mutex_tryenter(hash_lock)) {
4236 * Skip this buffer rather than waiting.

			passed_sz += ab->b_size;
			if (passed_sz > headroom) {
				/* Searched too far. */
				mutex_exit(hash_lock);
				break;
			}

			if (ab->b_spa != spa) {
				mutex_exit(hash_lock);
				continue;
			}

			if (ab->b_l2hdr != NULL) {
				/* Already in L2ARC. */
				mutex_exit(hash_lock);
				continue;
			}

			if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
				mutex_exit(hash_lock);
				continue;
			}

			if ((write_sz + ab->b_size) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				break;
			}

			if (ab->b_buf == NULL) {
				DTRACE_PROBE1(l2arc__buf__null, void *, ab);
				mutex_exit(hash_lock);
				continue;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
			hdrl2->b_dev = dev;
			hdrl2->b_daddr = dev->l2ad_hand;

			ab->b_flags |= ARC_L2_WRITING;
			ab->b_l2hdr = hdrl2;
			list_insert_head(dev->l2ad_buflist, ab);
			buf_data = ab->b_buf->b_data;
			buf_sz = ab->b_size;

			/*
			 * Compute and store the buffer cksum before
			 * writing.  On debug the cksum is verified first.
			 */
			arc_cksum_verify(ab->b_buf);
			arc_cksum_compute(ab->b_buf, B_TRUE);

			mutex_exit(hash_lock);

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);

			write_sz += buf_sz;
			dev->l2ad_hand += buf_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}
	mutex_exit(&l2arc_buflist_mtx);

	if (pio == NULL) {
		ASSERT3U(write_sz, ==, 0);
		kmem_cache_free(hdr_cache, head);
		return;
	}

	ASSERT3U(write_sz, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		spa_l2cache_space_update(dev->l2ad_vdev, 0,
		    dev->l2ad_end - dev->l2ad_hand);
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_evict = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	(void) zio_wait(pio);
}
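
/*
 * Editor's illustration (not part of the original code): the two hand
 * movements performed by l2arc_write_buffers() above, as a standalone
 * sketch.  vdev_psize_to_asize() rounds a buffer up to the device's
 * allocation granularity; a hypothetical "ashift" stands in for it here,
 * and all names are hypothetical.  The wrap rule matches the code above:
 * once the hand comes within target_sz of the end, it jumps back to the
 * start (l2arc_evict() has already cleared that region ahead of time).
 */
#include <stdint.h>

static uint64_t
l2_advance_hand_sketch(uint64_t hand, uint64_t start, uint64_t end,
    uint64_t buf_sz, uint64_t target_sz, unsigned ashift)
{
	/* Round the buffer up to a whole number of allocation units. */
	uint64_t asize = (buf_sz + (1ULL << ashift) - 1) &
	    ~((1ULL << ashift) - 1);

	hand += asize;

	/* Wrap to the device start once the remaining headroom is gone. */
	if (hand >= end - target_sz)
		hand = start;

	return (hand);
}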

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
4387 * Pause for l2arc_feed_secs seconds between writes.
4389 CALLB_CPR_SAFE_BEGIN(&cpr);
4390 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4391 hz * l2arc_feed_secs);
4392 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4395 * Quick check for L2ARC devices.
4397 mutex_enter(&l2arc_dev_mtx);
4398 if (l2arc_ndev == 0) {
4399 mutex_exit(&l2arc_dev_mtx);
4402 mutex_exit(&l2arc_dev_mtx);

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.  This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = dev->l2ad_write;
		if (arc_warm == B_FALSE)
			size += dev->l2ad_boost;

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		l2arc_write_buffers(spa, dev, size);

		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}
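
/*
 * Editor's illustration (not part of the original code): the shape of
 * the feed loop above in portable POSIX C.  cv_timedwait() both paces
 * the loop and lets l2arc_stop() wake the thread early;
 * pthread_cond_timedwait() plays the same role here.  All names are
 * hypothetical, and the one-second interval stands in for
 * l2arc_feed_secs.
 */
#include <pthread.h>
#include <time.h>

static pthread_mutex_t feed_lock_sketch = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t feed_cv_sketch = PTHREAD_COND_INITIALIZER;
static int feed_exit_sketch;

static void *
feed_thread_sketch(void *arg)
{
	(void) arg;

	pthread_mutex_lock(&feed_lock_sketch);
	while (feed_exit_sketch == 0) {
		struct timespec ts;

		/* Sleep for the feed interval, or until signalled. */
		(void) clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;
		(void) pthread_cond_timedwait(&feed_cv_sketch,
		    &feed_lock_sketch, &ts);

		/* ... select a device, evict ahead, write one batch ... */
	}
	feed_exit_sketch = 0;		/* acknowledge shutdown */
	pthread_cond_broadcast(&feed_cv_sketch);
	pthread_mutex_unlock(&feed_lock_sketch);
	return (NULL);
}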

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_write = l2arc_write_max;
	adddev->l2ad_boost = l2arc_write_boost;
	adddev->l2ad_start = start;
	adddev->l2ad_end = end;
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_evict = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	ASSERT3U(adddev->l2ad_write, >, 0);

	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2node));

	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);

	/*
	 * Add device to global list.
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
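
/*
 * Editor's illustration (not part of the original code): the intrusive
 * list convention behind the list_create(..., offsetof(...)) call above.
 * A Solaris list_t stores the byte offset of a link node embedded in
 * each element, so a link pointer can be converted back to its
 * containing structure.  The types and names below are hypothetical.
 */
#include <stddef.h>

struct link_sketch {
	struct link_sketch *next;
	struct link_sketch *prev;
};

struct hdr_sketch {
	unsigned long long size;
	struct link_sketch l2node;	/* embedded link, like b_l2node */
};

/* Recover the containing element from a pointer to its embedded link. */
static struct hdr_sketch *
node_to_hdr_sketch(struct link_sketch *node)
{
	return ((struct hdr_sketch *)
	    ((char *)node - offsetof(struct hdr_sketch, l2node)));
}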

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev.
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list.
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(remdev->l2ad_buflist);
	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */
	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_buflist_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
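
/*
 * Editor's illustration (not part of the original code): the shutdown
 * handshake used by l2arc_stop() above, in portable POSIX C against the
 * hypothetical state from the feed_thread_sketch() example earlier.  The
 * stopper raises the exit flag, wakes the sleeping thread, and then
 * waits for the thread to clear the flag on its way out, so the feed
 * loop is guaranteed to have stopped before this function returns.
 */
static void
stop_feed_sketch(void)
{
	pthread_mutex_lock(&feed_lock_sketch);
	feed_exit_sketch = 1;
	pthread_cond_signal(&feed_cv_sketch);	/* wake the sleeper */
	while (feed_exit_sketch != 0)
		pthread_cond_wait(&feed_cv_sketch, &feed_lock_sketch);
	pthread_mutex_unlock(&feed_lock_sketch);
}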