4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23 * All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/spa_impl.h>
28 #include <sys/vdev_impl.h>
29 #include <sys/trim_map.h>
/*
 * Calculate the zio end, rounding the size up to the top-level vdev's
 * ashift as would be done by zio_vdev_io_start.
 *
 * This makes free range consolidation much more effective
 * than it would otherwise be as well as ensuring that entire
 * blocks are invalidated by writes.
 *
 * Every macro argument is parenthesised so that arbitrary expressions
 * (e.g. "&leaf" or "a ? b : c") expand with the intended precedence.
 */
#define TRIM_ZIO_END(vd, offset, size)	((offset) +		\
	P2ROUNDUP((size), 1ULL << (vd)->vdev_top->vdev_ashift))

/* Add size bytes to the queued-TRIM byte counter of tm. */
#define TRIM_MAP_SINC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, (size))

/* Subtract size bytes from the queued-TRIM byte counter of tm. */
#define TRIM_MAP_SDEC(tm, size)					\
	atomic_add_64(&(tm)->tm_bytes, -(size))

/*
 * Increment / decrement the pending-TRIM counter of tm.  Like the two
 * macros above these expand to a single expression; the caller supplies
 * the terminating semicolon.
 */
#define TRIM_MAP_QINC(tm)					\
	atomic_inc_64(&(tm)->tm_pending)

#define TRIM_MAP_QDEC(tm)					\
	atomic_dec_64(&(tm)->tm_pending)
/*
 * Per-vdev TRIM bookkeeping; trim_map_create() stores one of these in
 * vd->vdev_trimmap.
 *
 * NOTE(review): the functions in this file also lock a tm_lock member
 * (mutex_enter(&tm->tm_lock)); its declaration and the end of the typedef
 * are not visible in this view.
 */
55 typedef struct trim_map {
56 list_t tm_head; /* List of segments sorted by txg. */
57 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
58 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
59 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
60 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
62 uint64_t tm_pending; /* Count of pending TRIMs. */
63 uint64_t tm_bytes; /* Total size in bytes of queued TRIMs. */
/*
 * One freed range [ts_start, ts_end).  A segment is linked into an AVL tree
 * (tm_queued_frees or tm_inflight_frees) through ts_node and, while queued,
 * into tm_head through ts_next.
 */
66 typedef struct trim_seg {
67 avl_node_t ts_node; /* AVL node. */
68 list_node_t ts_next; /* List element. */
69 uint64_t ts_start; /* Starting offset of this segment. */
70 uint64_t ts_end; /* Ending offset (non-inclusive). */
71 uint64_t ts_txg; /* Txg of creation or of the latest merge. */
72 hrtime_t ts_time; /* Time of creation or of the latest merge. */
/* TRIM on/off switch, defined outside this file; tested by every entry point. */
75 extern boolean_t zfs_trim_enabled;
/*
 * Tunables.  Each variable below is registered as a loader tunable
 * (TUNABLE_*) and as a sysctl (SYSCTL_*); the description strings of the
 * sysctl declarations state the intended meaning.
 */
/* Subtracted from the safe txg in trim_map_vdev_commit(). */
77 static u_int trim_txg_delay = 32;
/* Seconds; converted to nanoseconds in trim_map_vdev_commit(). */
78 static u_int trim_timeout = 30;
/* Seconds; scaled by hz for the timed wait in trim_thread(). */
79 static u_int trim_max_interval = 1;
80 /* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
81 static uint64_t trim_vdev_max_bytes = 2147483648;
82 /* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */
83 static u_int trim_vdev_max_pending = 64;
/* vfs.zfs.trim sysctl node holding the three timing tunables. */
85 SYSCTL_DECL(_vfs_zfs);
86 SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
88 TUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay);
89 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
90 0, "Delay TRIMs by up to this many TXGs");
92 TUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout);
93 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
94 "Delay TRIMs by up to this many seconds");
96 TUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval);
97 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
98 &trim_max_interval, 0,
99 "Maximum interval between TRIM queue processing (seconds)");
/* The two per-vdev limits live under the existing vfs.zfs.vdev node. */
101 SYSCTL_DECL(_vfs_zfs_vdev);
102 TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
103 SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
104 &trim_vdev_max_bytes, 0,
105 "Maximum pending TRIM bytes for a vdev");
107 TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
108 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
109 &trim_vdev_max_pending, 0,
110 "Maximum pending TRIM segments for a vdev");
/* Forward declaration: trim_map_destroy() calls this ahead of its definition. */
113 static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
/*
 * AVL comparison callback for trim_seg_t nodes; trim_map_create() installs
 * it on tm_queued_frees and tm_inflight_frees.  Segments are ordered by
 * ts_start, and each ordered case additionally tests whether the two
 * [ts_start, ts_end) ranges overlap.
 *
 * NOTE(review): the return statements are not visible in this view.
 * Presumably overlapping ranges compare as equal, so that a lookup with a
 * search range finds any intersecting segment -- confirm.
 */
116 trim_map_seg_compare(const void *x1, const void *x2)
118 const trim_seg_t *s1 = x1;
119 const trim_seg_t *s2 = x2;
/* s1 starts first: the ranges overlap if s1 extends past s2's start. */
121 if (s1->ts_start < s2->ts_start) {
122 if (s1->ts_end > s2->ts_start)
/* s1 starts later: the ranges overlap if s1 starts before s2 ends. */
126 if (s1->ts_start > s2->ts_start) {
127 if (s1->ts_start < s2->ts_end)
/*
 * AVL comparison callback for zio_t nodes; trim_map_create() installs it on
 * tm_inflight_writes.  Zios are ordered by io_offset, and each ordered case
 * additionally tests whether the two [io_offset, io_offset + io_size)
 * ranges overlap.
 *
 * NOTE(review): the return statements are not visible in this view.
 * Presumably overlapping ranges compare as equal -- confirm.
 */
135 trim_map_zio_compare(const void *x1, const void *x2)
137 const zio_t *z1 = x1;
138 const zio_t *z2 = x2;
/* z1 starts first: the ranges overlap if z1 extends past z2's start. */
140 if (z1->io_offset < z2->io_offset) {
141 if (z1->io_offset + z1->io_size > z2->io_offset)
/* z1 starts later: the ranges overlap if z1 starts before z2 ends. */
145 if (z1->io_offset > z2->io_offset) {
146 if (z1->io_offset < z2->io_offset + z2->io_size)
/*
 * Allocate a zeroed trim map for a leaf vdev, initialize its lock, lists
 * and AVL trees, and publish it as vd->vdev_trimmap.
 *
 * NOTE(review): the statement controlled by the zfs_trim_enabled test is
 * not visible in this view; presumably an early return -- confirm.
 */
154 trim_map_create(vdev_t *vd)
158 ASSERT(vd->vdev_ops->vdev_op_leaf);
160 if (!zfs_trim_enabled)
/* Zeroed allocation, so tm_pending and tm_bytes start at 0. */
163 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
164 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
/* tm_head links trim_seg_t objects through ts_next. */
165 list_create(&tm->tm_head, sizeof (trim_seg_t),
166 offsetof(trim_seg_t, ts_next));
/* Blocked writes are zio_t objects linked through io_trim_link. */
167 list_create(&tm->tm_pending_writes, sizeof (zio_t),
168 offsetof(zio_t, io_trim_link));
/* Both free trees use the same comparator and the same ts_node linkage. */
169 avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
170 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
171 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
172 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
/* In-flight writes are zio_t objects linked through io_trim_node. */
173 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
174 sizeof (zio_t), offsetof(zio_t, io_trim_node));
175 vd->vdev_trimmap = tm;
179 trim_map_destroy(vdev_t *vd)
184 ASSERT(vd->vdev_ops->vdev_op_leaf);
186 if (!zfs_trim_enabled)
189 tm = vd->vdev_trimmap;
194 * We may have been called before trim_map_vdev_commit_done()
195 * had a chance to run, so do it now to prune the remaining
198 trim_map_vdev_commit_done(vd->vdev_spa, vd);
200 mutex_enter(&tm->tm_lock);
201 while ((ts = list_head(&tm->tm_head)) != NULL) {
202 avl_remove(&tm->tm_queued_frees, ts);
203 list_remove(&tm->tm_head, ts);
204 kmem_free(ts, sizeof (*ts));
205 TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
208 mutex_exit(&tm->tm_lock);
210 avl_destroy(&tm->tm_queued_frees);
211 avl_destroy(&tm->tm_inflight_frees);
212 avl_destroy(&tm->tm_inflight_writes);
213 list_destroy(&tm->tm_pending_writes);
214 list_destroy(&tm->tm_head);
215 mutex_destroy(&tm->tm_lock);
216 kmem_free(tm, sizeof (*tm));
217 vd->vdev_trimmap = NULL;
/*
 * Record [start, end) as a queued free in tm for the given txg.
 *
 * Parts of the range already covered by a queued segment are not added
 * again.  The remainder is merged with a queued segment that ends exactly
 * at start and/or begins exactly at end, or else inserted as a new segment.
 * A segment that grows through a merge takes the new txg and time and is
 * moved to the tail of tm_head.  tm_bytes grows by the bytes added.
 *
 * The caller must hold tm->tm_lock.
 */
221 trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
224 trim_seg_t tsearch, *ts_before, *ts_after, *ts;
225 boolean_t merge_before, merge_after;
228 ASSERT(MUTEX_HELD(&tm->tm_lock));
/* Look for a queued segment overlapping [start, end). */
232 tsearch.ts_start = start;
233 tsearch.ts_end = end;
235 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
/*
 * NOTE(review): the test on ts guarding the next four lines is not visible
 * in this view.  Only the pieces of [start, end) that stick out on either
 * side of the found segment are added, recursively.
 */
237 if (start < ts->ts_start)
238 trim_map_segment_add(tm, start, ts->ts_start, txg);
239 if (end > ts->ts_end)
240 trim_map_segment_add(tm, ts->ts_end, end, txg);
/* No overlap: fetch the neighbours of the insertion point. */
244 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
245 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
/* A neighbour is merged only when it touches the new range exactly. */
247 merge_before = (ts_before != NULL && ts_before->ts_end == start);
248 merge_after = (ts_after != NULL && ts_after->ts_start == end);
/* The new range fills the gap: ts_after absorbs ts_before, which is freed. */
250 if (merge_before && merge_after) {
251 TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
253 avl_remove(&tm->tm_queued_frees, ts_before);
254 list_remove(&tm->tm_head, ts_before);
255 ts_after->ts_start = ts_before->ts_start;
256 ts_after->ts_txg = txg;
/* NOTE(review): "time" is assigned on a line not visible in this view. */
257 ts_after->ts_time = time;
258 list_remove(&tm->tm_head, ts_after);
259 list_insert_tail(&tm->tm_head, ts_after);
260 kmem_free(ts_before, sizeof (*ts_before));
/* Extend the preceding segment forward to end. */
261 } else if (merge_before) {
262 TRIM_MAP_SINC(tm, end - ts_before->ts_end);
263 ts_before->ts_end = end;
264 ts_before->ts_txg = txg;
265 ts_before->ts_time = time;
266 list_remove(&tm->tm_head, ts_before);
267 list_insert_tail(&tm->tm_head, ts_before);
/* Extend the following segment backward to start. */
268 } else if (merge_after) {
269 TRIM_MAP_SINC(tm, ts_after->ts_start - start);
270 ts_after->ts_start = start;
271 ts_after->ts_txg = txg;
272 ts_after->ts_time = time;
273 list_remove(&tm->tm_head, ts_after);
274 list_insert_tail(&tm->tm_head, ts_after);
/* Nothing adjacent: allocate a new segment and queue it at the tail. */
276 TRIM_MAP_SINC(tm, end - start);
278 ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
279 ts->ts_start = start;
283 avl_insert(&tm->tm_queued_frees, ts, where);
284 list_insert_tail(&tm->tm_head, ts);
/*
 * Remove the range [start, end) from the queued segment ts.
 *
 * left_over / right_over record whether ts extends beyond the range on the
 * respective side.  With both set, a new segment nts is allocated for the
 * right-hand remainder and inserted directly after ts; with neither set,
 * ts is removed from the tree and list and freed.
 * NOTE(review): the bodies of the single-sided branches and some
 * assignments of the split case are not visible in this view.
 *
 * NOTE(review): tm_bytes is reduced by end - start regardless of how much
 * of ts actually lies inside [start, end); trim_map_write_start() passes
 * its full write range here, which can be larger than the overlap --
 * confirm the counter cannot be driven below zero.
 *
 * The caller must hold tm->tm_lock.
 */
289 trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
293 boolean_t left_over, right_over;
295 ASSERT(MUTEX_HELD(&tm->tm_lock));
297 left_over = (ts->ts_start < start);
298 right_over = (ts->ts_end > end);
300 TRIM_MAP_SDEC(tm, end - start);
/* The range is strictly inside ts: split, nts inherits ts's end/txg/time. */
301 if (left_over && right_over) {
302 nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
304 nts->ts_end = ts->ts_end;
305 nts->ts_txg = ts->ts_txg;
306 nts->ts_time = ts->ts_time;
308 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
309 list_insert_after(&tm->tm_head, ts, nts);
311 } else if (left_over) {
313 } else if (right_over) {
/* ts is completely covered by the range: drop it. */
316 avl_remove(&tm->tm_queued_frees, ts);
317 list_remove(&tm->tm_head, ts);
319 kmem_free(ts, sizeof (*ts));
/*
 * Queue [start, end) for TRIM, leaving out whatever an in-flight write
 * covers.  The range is looked up in tm_inflight_writes; the parts before
 * and after a found write are handled by recursive calls.
 * NOTE(review): the test on zs that selects between the direct
 * trim_map_segment_add() call and the recursive split is not visible in
 * this view.
 *
 * The caller must hold tm->tm_lock.
 */
324 trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
328 ASSERT(MUTEX_HELD(&tm->tm_lock));
/* Build a search key describing the range as a zio. */
330 zsearch.io_offset = start;
331 zsearch.io_size = end - start;
333 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
335 trim_map_segment_add(tm, start, end, txg);
/* Part of the range in front of the write. */
338 if (start < zs->io_offset)
339 trim_map_free_locked(tm, start, zs->io_offset, txg);
/* Part of the range behind the write. */
340 if (zs->io_offset + zs->io_size < end)
341 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
/*
 * Record that size bytes at offset on vd were freed in txg.  The end of the
 * range is computed with TRIM_ZIO_END(), and the range is queued under
 * tm->tm_lock via trim_map_free_locked().
 *
 * NOTE(review): the statement controlled by the enabled / vdev_notrim /
 * NULL-map test is not visible in this view; presumably an early return.
 */
345 trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
347 trim_map_t *tm = vd->vdev_trimmap;
349 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
352 mutex_enter(&tm->tm_lock);
353 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
354 mutex_exit(&tm->tm_lock);
/*
 * Register a write zio with the trim map of its vdev.
 *
 * The write range runs from io_offset to TRIM_ZIO_END().  If it is found in
 * tm_inflight_frees the zio is appended to tm_pending_writes; otherwise
 * overlapping queued frees are removed from tm_queued_frees and the zio is
 * added to tm_inflight_writes.
 *
 * NOTE(review): the return statements, the tests on ts and the statement
 * controlled by the enabled / vdev_notrim / NULL-map test are not visible
 * in this view.
 */
358 trim_map_write_start(zio_t *zio)
360 vdev_t *vd = zio->io_vd;
361 trim_map_t *tm = vd->vdev_trimmap;
362 trim_seg_t tsearch, *ts;
363 boolean_t left_over, right_over;
366 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
/* Search key covering the write range. */
369 start = zio->io_offset;
370 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
371 tsearch.ts_start = start;
372 tsearch.ts_end = end;
374 mutex_enter(&tm->tm_lock);
377 * Checking for colliding in-flight frees.
379 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
/* Collision: park the zio; trim_map_vdev_commit_done() reissues it. */
381 list_insert_tail(&tm->tm_pending_writes, zio);
382 mutex_exit(&tm->tm_lock);
/* No collision: cancel queued frees that the write would overwrite. */
386 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
389 * Loop until all overlapping segments are removed.
392 trim_map_segment_remove(tm, ts, start, end);
393 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
394 } while (ts != NULL);
/* Track the write so that frees issued meanwhile avoid its range. */
396 avl_add(&tm->tm_inflight_writes, zio);
398 mutex_exit(&tm->tm_lock);
/*
 * Counterpart of trim_map_write_start(): remove zio from
 * tm_inflight_writes under tm->tm_lock, but only if the zio is actually
 * linked into that tree.
 *
 * NOTE(review): the statement controlled by the enabled / NULL-map test is
 * not visible in this view; presumably an early return.
 */
404 trim_map_write_done(zio_t *zio)
406 vdev_t *vd = zio->io_vd;
407 trim_map_t *tm = vd->vdev_trimmap;
410 * Don't check for vdev_notrim, since the write could have
411 * started before vdev_notrim was set.
413 if (!zfs_trim_enabled || tm == NULL)
416 mutex_enter(&tm->tm_lock);
418 * Don't fail if the write isn't in the tree, since the write
419 * could have started after vdev_notrim was set.
/*
 * The node counts as linked when it has a child, has a parent, or is the
 * root of the tree.
 */
421 if (zio->io_trim_node.avl_child[0] ||
422 zio->io_trim_node.avl_child[1] ||
423 AVL_XPARENT(&zio->io_trim_node) ||
424 tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
425 avl_remove(&tm->tm_inflight_writes, zio);
426 mutex_exit(&tm->tm_lock);
430 * Return the oldest segment (the one with the lowest txg / time) or NULL if:
431 * 1. The list is empty
432 * 2. The first element's txg is greater than txgsafe
433 * 3. The first element's txg is greater than the txg argument, its time is
434 * greater than the time argument, and neither per-vdev limit is exceeded
/*
 * Test whether the head of tm->tm_head is eligible for TRIM.  Its txg must
 * not exceed txgsafe, and in addition at least one of the following must
 * hold: its txg is at most txg, its time is at most time, tm_bytes exceeds
 * trim_vdev_max_bytes, or tm_pending exceeds trim_vdev_max_pending.
 *
 * NOTE(review): the statements following the test are not visible in this
 * view; presumably the head is returned when the test holds and NULL
 * otherwise.
 *
 * The caller must hold tm->tm_lock, and txgsafe must be >= txg (VERIFY).
 */
437 trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
441 ASSERT(MUTEX_HELD(&tm->tm_lock));
442 VERIFY(txgsafe >= txg);
444 ts = list_head(&tm->tm_head);
445 if (ts != NULL && ts->ts_txg <= txgsafe &&
446 (ts->ts_txg <= txg || ts->ts_time <= time ||
447 tm->tm_bytes > trim_vdev_max_bytes ||
448 tm->tm_pending > trim_vdev_max_pending))
/*
 * Issue TRIMs for the eligible queued segments of leaf vdev vd.
 *
 * Each segment returned by trim_map_first() is moved from the queue
 * (tm_head and tm_queued_frees) to tm_inflight_frees, and a zio_trim() for
 * it is started without waiting, with zio passed as its first argument.
 * tm_bytes is reduced by the segment size.
 *
 * NOTE(review): lines between the declarations and the timelimit
 * computation, and the else branches of the txg computation, are not
 * visible in this view.
 */
454 trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
456 trim_map_t *tm = vd->vdev_trimmap;
458 uint64_t size, txgtarget, txgsafe;
461 ASSERT(vd->vdev_ops->vdev_op_leaf);
/* Segments stamped at or before this time satisfy the time criterion. */
466 timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
/* For vdev_isl2cache vdevs both txg limits are set to the maximum value. */
467 if (vd->vdev_isl2cache) {
468 txgsafe = UINT64_MAX;
469 txgtarget = UINT64_MAX;
/* Otherwise use the lower of the last synced txg and the freeze txg. */
471 txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
/* Guarded so that the unsigned subtraction cannot wrap. */
472 if (txgsafe > trim_txg_delay)
473 txgtarget = txgsafe - trim_txg_delay;
478 mutex_enter(&tm->tm_lock);
479 /* Loop until we have sent all eligible frees. */
480 while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
482 list_remove(&tm->tm_head, ts);
483 avl_remove(&tm->tm_queued_frees, ts);
484 avl_add(&tm->tm_inflight_frees, ts);
485 size = ts->ts_end - ts->ts_start;
486 zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
487 TRIM_MAP_SDEC(tm, size);
490 mutex_exit(&tm->tm_lock);
/*
 * Completion pass for leaf vdev vd: free every segment left in
 * tm_inflight_frees, then reissue the writes that were parked on
 * tm_pending_writes.
 *
 * The pending writes are moved to a local list while tm->tm_lock is held
 * and reissued only after the lock has been dropped.
 *
 * NOTE(review): lines between the leaf assertion and mutex_enter() are not
 * visible in this view.
 */
494 trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
496 trim_map_t *tm = vd->vdev_trimmap;
498 list_t pending_writes;
500 uint64_t start, size;
503 ASSERT(vd->vdev_ops->vdev_op_leaf);
508 mutex_enter(&tm->tm_lock);
/* Drain the in-flight tree, freeing each segment. */
509 if (!avl_is_empty(&tm->tm_inflight_frees)) {
511 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
513 kmem_free(ts, sizeof (*ts));
/* Take over the blocked writes so that they can be handled unlocked. */
516 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
518 list_move_tail(&pending_writes, &tm->tm_pending_writes);
519 mutex_exit(&tm->tm_lock);
521 while ((zio = list_remove_head(&pending_writes)) != NULL) {
522 zio_vdev_io_reissue(zio);
525 list_destroy(&pending_writes);
/*
 * Walk the vdev tree rooted at vd: a leaf vdev is handed to
 * trim_map_vdev_commit(), and the loop recurses into every child.
 *
 * NOTE(review): the lines between the signature and the leaf test, and the
 * way the leaf branch ends, are not visible in this view.
 */
529 trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
536 if (vd->vdev_ops->vdev_op_leaf) {
537 trim_map_vdev_commit(spa, zio, vd);
539 for (c = 0; c < vd->vdev_children; c++)
540 trim_map_commit(spa, zio, vd->vdev_child[c]);
/*
 * Walk the vdev tree rooted at vd: a leaf vdev is handed to
 * trim_map_vdev_commit_done(), and the loop recurses into every child.
 *
 * NOTE(review): the lines between the signature and the leaf test, and the
 * way the leaf branch ends, are not visible in this view.
 */
545 trim_map_commit_done(spa_t *spa, vdev_t *vd)
552 if (vd->vdev_ops->vdev_op_leaf) {
553 trim_map_vdev_commit_done(spa, vd);
555 for (c = 0; c < vd->vdev_children; c++)
556 trim_map_commit_done(spa, vd->vdev_child[c]);
/*
 * Body of the TRIM thread started by trim_thread_create(), which passes the
 * spa_t as arg.
 *
 * Visible steps: wait on spa_trim_cv with a timeout, then issue the TRIMs
 * of the whole pool under a root zio, wait for them and run the completion
 * pass, all while holding SCL_STATE as reader.
 *
 * NOTE(review): the declarations, the enclosing loop and the exit path are
 * not visible in this view.
 */
561 trim_thread(void *arg)
/* Name the thread "trim <pool name>". */
567 (void) snprintf(curthread->td_name, sizeof(curthread->td_name),
568 "trim %s", spa_name(spa));
572 mutex_enter(&spa->spa_trim_lock);
/*
 * spa_trim_thread == NULL is the stop request of trim_thread_destroy();
 * acknowledge it by storing a non-NULL value and signalling the cv.
 */
573 if (spa->spa_trim_thread == NULL) {
574 spa->spa_trim_thread = curthread;
575 cv_signal(&spa->spa_trim_cv);
576 mutex_exit(&spa->spa_trim_lock);
/* Sleep until signalled or until the hz * trim_max_interval timeout. */
580 (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
581 hz * trim_max_interval);
582 mutex_exit(&spa->spa_trim_lock);
/* Root zio passed down to every zio_trim() issued by the commit below. */
584 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
586 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
587 trim_map_commit(spa, zio, spa->spa_root_vdev);
/* The result of zio_wait() is deliberately ignored. */
588 (void) zio_wait(zio);
589 trim_map_commit_done(spa, spa->spa_root_vdev);
590 spa_config_exit(spa, SCL_STATE, FTAG);
/*
 * Initialize spa_trim_lock and spa_trim_cv and start trim_thread() for the
 * pool.  The thread handle is stored in spa->spa_trim_thread while
 * spa_trim_lock is held.
 *
 * NOTE(review): the statement controlled by the zfs_trim_enabled test is
 * not visible in this view; presumably an early return.
 */
595 trim_thread_create(spa_t *spa)
598 if (!zfs_trim_enabled)
601 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
602 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
603 mutex_enter(&spa->spa_trim_lock);
604 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
605 TS_RUN, minclsyspri);
606 mutex_exit(&spa->spa_trim_lock);
/*
 * Stop the TRIM thread of the pool and destroy spa_trim_cv and
 * spa_trim_lock.
 *
 * Handshake: spa_trim_thread is set to NULL and the cv is signalled; the
 * thread answers by storing a non-NULL value and signalling back (see
 * trim_thread()), after which the field is cleared again.
 *
 * NOTE(review): the statements controlled by the two leading tests are not
 * visible in this view; presumably early returns.  The first read of
 * spa_trim_thread happens before spa_trim_lock is taken.
 */
610 trim_thread_destroy(spa_t *spa)
613 if (!zfs_trim_enabled)
615 if (spa->spa_trim_thread == NULL)
618 mutex_enter(&spa->spa_trim_lock);
619 /* Setting spa_trim_thread to NULL tells the thread to stop. */
620 spa->spa_trim_thread = NULL;
621 cv_signal(&spa->spa_trim_cv);
622 /* The thread will set it back to != NULL on exit. */
623 while (spa->spa_trim_thread == NULL)
624 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
625 spa->spa_trim_thread = NULL;
626 mutex_exit(&spa->spa_trim_lock);
628 cv_destroy(&spa->spa_trim_cv);
629 mutex_destroy(&spa->spa_trim_lock);
/*
 * Wake the TRIM thread of the pool by signalling spa_trim_cv under
 * spa_trim_lock, cutting short its timed wait.
 *
 * NOTE(review): the statements controlled by the two leading tests are not
 * visible in this view; presumably early returns.  spa_trim_thread is read
 * before spa_trim_lock is taken.
 */
633 trim_thread_wakeup(spa_t *spa)
636 if (!zfs_trim_enabled)
638 if (spa->spa_trim_thread == NULL)
641 mutex_enter(&spa->spa_trim_lock);
642 cv_signal(&spa->spa_trim_cv);
643 mutex_exit(&spa->spa_trim_lock);