4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23 * All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/spa_impl.h>
28 #include <sys/vdev_impl.h>
29 #include <sys/trim_map.h>
33 * Calculate the zio end, upgrading based on ashift which would be
34 * done by zio_vdev_io_start.
36 * This makes free range consolidation much more effective
37 * than it would otherwise be as well as ensuring that entire
38 * blocks are invalidated by writes.
40 #define TRIM_ZIO_END(vd, offset, size) (offset + \
41 P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
43 /* Maximal segment size for ATA TRIM. */
44 #define TRIM_MAP_SIZE_FACTOR (512 << 16)
/* Number of ATA TRIM command segments needed to cover "size" bytes. */
46 #define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR)
/*
 * TRIM_MAP_ADD appends a segment to the map's txg-ordered list and adds
 * its segment count to tm_pending; TRIM_MAP_REM is the exact inverse.
 * NOTE(review): the "} while (0)" terminators of both macros appear to
 * have been lost in this extraction — confirm against upstream.
 */
48 #define TRIM_MAP_ADD(tm, ts) do { \
49 list_insert_tail(&(tm)->tm_head, (ts)); \
50 (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
53 #define TRIM_MAP_REM(tm, ts) do { \
54 list_remove(&(tm)->tm_head, (ts)); \
55 (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
/*
 * Per-leaf-vdev TRIM state: freed ranges queued for TRIM, TRIMs and
 * writes currently in flight, and writes blocked behind in-flight frees.
 * NOTE(review): the tm_lock mutex member (used throughout via
 * mutex_enter(&tm->tm_lock)) and the closing "} trim_map_t;" appear
 * elided in this extraction.
 */
58 typedef struct trim_map {
59 list_t tm_head; /* List of segments sorted by txg. */
60 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
61 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
62 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
63 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
65 uint64_t tm_pending; /* Count of pending TRIMs. */
/*
 * One freed range [ts_start, ts_end) awaiting TRIM, linked both into the
 * map's AVL tree (by offset) and its list (by creation txg/time).
 * NOTE(review): the closing "} trim_seg_t;" appears elided here.
 */
68 typedef struct trim_seg {
69 avl_node_t ts_node; /* AVL node. */
70 list_node_t ts_next; /* List element. */
71 uint64_t ts_start; /* Starting offset of this segment. */
72 uint64_t ts_end; /* Ending offset (non-inclusive). */
73 uint64_t ts_txg; /* Segment creation txg. */
74 hrtime_t ts_time; /* Segment creation time. */
/* Global on/off switch for TRIM support, defined elsewhere. */
77 extern boolean_t zfs_trim_enabled;
/*
 * Tunables controlling how long freed ranges are retained before being
 * TRIMmed and how much may queue up per vdev; exposed below as sysctls
 * (CTLFLAG_RWTUN: settable at runtime and as a loader tunable).
 */
79 static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */
80 static u_int trim_timeout = 30; /* Keep deleted data up to 30s */
81 static u_int trim_max_interval = 1; /* 1s delays between TRIMs */
82 static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
84 SYSCTL_DECL(_vfs_zfs);
85 SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
88 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
89 0, "Delay TRIMs by up to this many TXGs");
90 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
91 "Delay TRIMs by up to this many seconds");
92 SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
93 &trim_max_interval, 0,
94 "Maximum interval between TRIM queue processing (seconds)");
96 SYSCTL_DECL(_vfs_zfs_vdev);
97 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
98 &trim_vdev_max_pending, 0,
99 "Maximum pending TRIM segments for a vdev");
/* Forward declaration: needed by trim_map_destroy() below. */
101 static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
/*
 * AVL comparator for trim_seg_t, ordering by ts_start.  The nested range
 * checks suggest overlapping segments compare as equal so avl_find()
 * locates any colliding segment; the return statements are elided in
 * this extraction — confirm against upstream before relying on this.
 */
104 trim_map_seg_compare(const void *x1, const void *x2)
106 const trim_seg_t *s1 = x1;
107 const trim_seg_t *s2 = x2;
109 if (s1->ts_start < s2->ts_start) {
110 if (s1->ts_end > s2->ts_start)
114 if (s1->ts_start > s2->ts_start) {
115 if (s1->ts_start < s2->ts_end)
/*
 * AVL comparator for in-flight write zios, ordering by io_offset over
 * the range [io_offset, io_offset + io_size).  As with the segment
 * comparator above, overlapping I/Os appear to compare as equal; the
 * return statements are elided in this extraction.
 */
123 trim_map_zio_compare(const void *x1, const void *x2)
125 const zio_t *z1 = x1;
126 const zio_t *z2 = x2;
128 if (z1->io_offset < z2->io_offset) {
129 if (z1->io_offset + z1->io_size > z2->io_offset)
133 if (z1->io_offset > z2->io_offset) {
134 if (z1->io_offset < z2->io_offset + z2->io_size)
/*
 * Allocate and initialize the per-vdev trim map for a TRIM-capable leaf
 * vdev and hang it off vd->vdev_trimmap.  Must only be called when TRIM
 * is enabled and the vdev supports it (asserted below).
 */
142 trim_map_create(vdev_t *vd)
146 ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
147 vd->vdev_ops->vdev_op_leaf);
/* Zeroed allocation; KM_SLEEP means this cannot fail. */
149 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
150 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
151 list_create(&tm->tm_head, sizeof (trim_seg_t),
152 offsetof(trim_seg_t, ts_next));
153 list_create(&tm->tm_pending_writes, sizeof (zio_t),
154 offsetof(zio_t, io_trim_link));
/* Queued and in-flight frees share one comparator; writes use another. */
155 avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
156 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
157 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
158 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
159 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
160 sizeof (zio_t), offsetof(zio_t, io_trim_node));
161 vd->vdev_trimmap = tm;
/*
 * Tear down a leaf vdev's trim map: drain in-flight frees, discard all
 * still-queued segments, destroy the containers, and clear
 * vd->vdev_trimmap.  No-op when TRIM is disabled.
 */
165 trim_map_destroy(vdev_t *vd)
170 ASSERT(vd->vdev_ops->vdev_op_leaf);
172 if (!zfs_trim_enabled)
175 tm = vd->vdev_trimmap;
180 * We may have been called before trim_map_vdev_commit_done()
181 * had a chance to run, so do it now to prune the remaining
184 trim_map_vdev_commit_done(vd->vdev_spa, vd);
/* Free every queued segment; TRIM_MAP_REM keeps tm_pending consistent. */
186 mutex_enter(&tm->tm_lock);
187 while ((ts = list_head(&tm->tm_head)) != NULL) {
188 avl_remove(&tm->tm_queued_frees, ts);
189 TRIM_MAP_REM(tm, ts);
190 kmem_free(ts, sizeof (*ts));
192 mutex_exit(&tm->tm_lock);
/* All trees/lists are empty at this point, so destruction is safe. */
194 avl_destroy(&tm->tm_queued_frees);
195 avl_destroy(&tm->tm_inflight_frees);
196 avl_destroy(&tm->tm_inflight_writes);
197 list_destroy(&tm->tm_pending_writes);
198 list_destroy(&tm->tm_head);
199 mutex_destroy(&tm->tm_lock);
200 kmem_free(tm, sizeof (*tm));
201 vd->vdev_trimmap = NULL;
/*
 * Insert the freed range [start, end) into tm_queued_frees, coalescing
 * with physically adjacent neighbors.  If the range collides with an
 * existing segment, recurse on the non-overlapping leading/trailing
 * pieces.  Caller holds tm_lock.
 */
205 trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
208 trim_seg_t tsearch, *ts_before, *ts_after, *ts;
209 boolean_t merge_before, merge_after;
212 ASSERT(MUTEX_HELD(&tm->tm_lock));
216 tsearch.ts_start = start;
217 tsearch.ts_end = end;
/* A hit means overlap: peel off the parts outside the existing segment. */
219 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
221 if (start < ts->ts_start)
222 trim_map_segment_add(tm, start, ts->ts_start, txg);
223 if (end > ts->ts_end)
224 trim_map_segment_add(tm, ts->ts_end, end, txg);
/* No overlap: check whether the neighbors touch us exactly. */
228 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
229 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
231 merge_before = (ts_before != NULL && ts_before->ts_end == start);
232 merge_after = (ts_after != NULL && ts_after->ts_start == end);
/*
 * Merged segments take the new txg/time so their retention clock
 * restarts.  "time" is presumably a creation timestamp captured in a
 * declaration elided from this extraction — confirm upstream.
 */
234 if (merge_before && merge_after) {
235 avl_remove(&tm->tm_queued_frees, ts_before);
236 TRIM_MAP_REM(tm, ts_before);
237 TRIM_MAP_REM(tm, ts_after);
238 ts_after->ts_start = ts_before->ts_start;
239 ts_after->ts_txg = txg;
240 ts_after->ts_time = time;
241 TRIM_MAP_ADD(tm, ts_after);
242 kmem_free(ts_before, sizeof (*ts_before));
243 } else if (merge_before) {
244 TRIM_MAP_REM(tm, ts_before);
245 ts_before->ts_end = end;
246 ts_before->ts_txg = txg;
247 ts_before->ts_time = time;
248 TRIM_MAP_ADD(tm, ts_before);
249 } else if (merge_after) {
250 TRIM_MAP_REM(tm, ts_after);
251 ts_after->ts_start = start;
252 ts_after->ts_txg = txg;
253 ts_after->ts_time = time;
254 TRIM_MAP_ADD(tm, ts_after);
/* No neighbor to merge with: create a fresh segment at "where". */
256 ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
257 ts->ts_start = start;
261 avl_insert(&tm->tm_queued_frees, ts, where);
262 TRIM_MAP_ADD(tm, ts);
/*
 * Carve [start, end) out of queued segment ts (which overlaps it).  If
 * the removal leaves freed space on both sides, split ts in two; if only
 * one side survives, shrink ts in place; otherwise drop ts entirely.
 * Caller holds tm_lock.
 */
267 trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
271 boolean_t left_over, right_over;
273 ASSERT(MUTEX_HELD(&tm->tm_lock));
275 left_over = (ts->ts_start < start);
276 right_over = (ts->ts_end > end);
/* Pull ts off the list while we resize it; re-add below as needed. */
278 TRIM_MAP_REM(tm, ts);
279 if (left_over && right_over) {
/* Split: nts inherits the right-hand remainder and ts keeps the left. */
280 nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
282 nts->ts_end = ts->ts_end;
283 nts->ts_txg = ts->ts_txg;
284 nts->ts_time = ts->ts_time;
286 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
287 TRIM_MAP_ADD(tm, ts);
288 TRIM_MAP_ADD(tm, nts);
289 } else if (left_over) {
291 TRIM_MAP_ADD(tm, ts);
292 } else if (right_over) {
294 TRIM_MAP_ADD(tm, ts);
/* Fully covered: the segment disappears. */
296 avl_remove(&tm->tm_queued_frees, ts);
297 kmem_free(ts, sizeof (*ts));
/*
 * Queue [start, end) for TRIM.  If the range overlaps an in-flight
 * write, recurse on the pieces before and after that write so the
 * written region itself is never queued.  Caller holds tm_lock.
 */
302 trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
306 ASSERT(MUTEX_HELD(&tm->tm_lock));
308 zsearch.io_offset = start;
309 zsearch.io_size = end - start;
/* No colliding in-flight write: the whole range can be queued. */
311 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
313 trim_map_segment_add(tm, start, end, txg);
316 if (start < zs->io_offset)
317 trim_map_free_locked(tm, start, zs->io_offset, txg);
318 if (zs->io_offset + zs->io_size < end)
319 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
/*
 * Public entry point: record that [offset, offset + size) on vdev vd was
 * freed in txg.  The end is rounded up to the vdev's ashift via
 * TRIM_ZIO_END.  No-op when TRIM is disabled or unsupported on vd.
 */
323 trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
325 trim_map_t *tm = vd->vdev_trimmap;
327 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
330 mutex_enter(&tm->tm_lock);
331 trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
332 mutex_exit(&tm->tm_lock);
/*
 * Called as a write zio starts.  If the write collides with an in-flight
 * TRIM it is parked on tm_pending_writes (reissued later by
 * trim_map_vdev_commit_done()); otherwise any queued frees it overlaps
 * are removed and the write is registered in tm_inflight_writes so new
 * frees route around it.  Return value lines are elided here — the
 * boolean presumably tells the caller whether the write may proceed.
 */
336 trim_map_write_start(zio_t *zio)
338 vdev_t *vd = zio->io_vd;
339 trim_map_t *tm = vd->vdev_trimmap;
340 trim_seg_t tsearch, *ts;
341 boolean_t left_over, right_over;
344 if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
/* Same ashift rounding as trim_map_free, so ranges match exactly. */
347 start = zio->io_offset;
348 end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
349 tsearch.ts_start = start;
350 tsearch.ts_end = end;
352 mutex_enter(&tm->tm_lock);
355 * Checking for colliding in-flight frees.
357 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
359 list_insert_tail(&tm->tm_pending_writes, zio);
360 mutex_exit(&tm->tm_lock);
365 * Loop until all overlapping segments are removed.
367 while ((ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL)) != NULL) {
368 trim_map_segment_remove(tm, ts, start, end);
371 avl_add(&tm->tm_inflight_writes, zio);
373 mutex_exit(&tm->tm_lock);
/*
 * Called when a write zio completes: deregister it from
 * tm_inflight_writes, but only if it is actually linked into the tree
 * (checked by inspecting the AVL node directly), since the write may
 * have started after vdev_notrim was set and never been added.
 */
379 trim_map_write_done(zio_t *zio)
381 vdev_t *vd = zio->io_vd;
382 trim_map_t *tm = vd->vdev_trimmap;
385 * Don't check for vdev_notrim, since the write could have
386 * started before vdev_notrim was set.
388 if (!zfs_trim_enabled || tm == NULL)
391 mutex_enter(&tm->tm_lock);
393 * Don't fail if the write isn't in the tree, since the write
394 * could have started after vdev_notrim was set.
/*
 * Membership test: a linked node has a child, a parent, or is the
 * root.  A fully unlinked node has none of these.
 */
396 if (zio->io_trim_node.avl_child[0] ||
397 zio->io_trim_node.avl_child[1] ||
398 AVL_XPARENT(&zio->io_trim_node) ||
399 tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
400 avl_remove(&tm->tm_inflight_writes, zio);
401 mutex_exit(&tm->tm_lock);
405 * Return the oldest segment (the one with the lowest txg / time) or NULL if:
406 * 1. The list is empty
407 * 2. The first element's txg is greater than txgsafe
408 * 3. The first element's txg is not greater than the txg argument and the
409 * the first element's time is not greater than time argument
412 trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
417 ASSERT(MUTEX_HELD(&tm->tm_lock));
418 VERIFY(txgsafe >= txg);
/*
 * The list head is the oldest segment: TRIM_MAP_ADD always appends at
 * the tail, so the list stays in creation (txg) order.  "force"
 * overrides the txg/time age gates but never the txgsafe bound.
 */
420 ts = list_head(&tm->tm_head);
421 if (ts != NULL && ts->ts_txg <= txgsafe &&
422 (ts->ts_txg <= txg || ts->ts_time <= time || force))
/*
 * Issue TRIM zios for every eligible queued segment of leaf vdev vd,
 * moving each from tm_queued_frees to tm_inflight_frees.  Eligibility is
 * age-based (older than trim_txg_delay TXGs or trim_timeout seconds),
 * with "hard"/"soft" pressure counters forcing extra work when more than
 * trim_vdev_max_pending segments have piled up.
 */
428 trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
430 trim_map_t *tm = vd->vdev_trimmap;
432 uint64_t size, offset, txgtarget, txgsafe;
436 ASSERT(vd->vdev_ops->vdev_op_leaf);
/* Segments created before this timestamp are old enough to TRIM. */
441 timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
442 if (vd->vdev_isl2cache) {
/* L2ARC contents are expendable: every segment is immediately safe. */
443 txgsafe = UINT64_MAX;
444 txgtarget = UINT64_MAX;
446 txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
447 if (txgsafe > trim_txg_delay)
448 txgtarget = txgsafe - trim_txg_delay;
453 mutex_enter(&tm->tm_lock);
/* Under pressure, drain a quarter of the overflow per pass ("hard"). */
455 if (tm->tm_pending > trim_vdev_max_pending)
456 hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
457 soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
458 /* Loop until we have sent all outstanding free's */
460 (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
462 TRIM_MAP_REM(tm, ts);
463 avl_remove(&tm->tm_queued_frees, ts);
464 avl_add(&tm->tm_inflight_frees, ts);
465 size = ts->ts_end - ts->ts_start;
466 offset = ts->ts_start;
468 * We drop the lock while we call zio_nowait as the IO
469 * scheduler can result in a different IO being run e.g.
470 * a write which would result in a recursive lock.
472 mutex_exit(&tm->tm_lock);
474 zio_nowait(zio_trim(zio, spa, vd, offset, size));
476 soft -= TRIM_MAP_SEGS(size);
477 hard -= TRIM_MAP_SEGS(size);
478 mutex_enter(&tm->tm_lock);
480 mutex_exit(&tm->tm_lock);
/*
 * After a TRIM batch completes on leaf vdev vd: discard all in-flight
 * free segments and reissue every write that was parked behind them.
 * The pending writes are moved to a local list first so the reissues
 * happen outside tm_lock.
 */
484 trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
486 trim_map_t *tm = vd->vdev_trimmap;
488 list_t pending_writes;
490 uint64_t start, size;
493 ASSERT(vd->vdev_ops->vdev_op_leaf);
498 mutex_enter(&tm->tm_lock);
499 if (!avl_is_empty(&tm->tm_inflight_frees)) {
/* avl_destroy_nodes empties the tree; each segment is freed here. */
501 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
503 kmem_free(ts, sizeof (*ts));
506 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
508 list_move_tail(&pending_writes, &tm->tm_pending_writes);
509 mutex_exit(&tm->tm_lock);
511 while ((zio = list_remove_head(&pending_writes)) != NULL) {
512 zio_vdev_io_reissue(zio);
515 list_destroy(&pending_writes);
/*
 * Recursively walk the vdev tree, committing queued TRIMs on each leaf
 * via trim_map_vdev_commit().
 */
519 trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
526 if (vd->vdev_ops->vdev_op_leaf) {
527 trim_map_vdev_commit(spa, zio, vd);
529 for (c = 0; c < vd->vdev_children; c++)
530 trim_map_commit(spa, zio, vd->vdev_child[c]);
/*
 * Recursively walk the vdev tree, finishing a TRIM batch on each leaf
 * via trim_map_vdev_commit_done() (frees in-flight segments, reissues
 * parked writes).
 */
535 trim_map_commit_done(spa_t *spa, vdev_t *vd)
542 if (vd->vdev_ops->vdev_op_leaf) {
543 trim_map_vdev_commit_done(spa, vd);
545 for (c = 0; c < vd->vdev_children; c++)
546 trim_map_commit_done(spa, vd->vdev_child[c]);
/*
 * Per-pool TRIM worker thread.  Loops until trim_thread_destroy() clears
 * spa->spa_trim_thread; each iteration sleeps up to trim_max_interval
 * seconds (or until woken), then commits queued TRIMs across the whole
 * vdev tree under an SCL_STATE reader hold.
 */
551 trim_thread(void *arg)
557 (void) snprintf(curthread->td_name, sizeof(curthread->td_name),
558 "trim %s", spa_name(spa));
562 mutex_enter(&spa->spa_trim_lock);
563 if (spa->spa_trim_thread == NULL) {
/*
 * Shutdown handshake: destroy cleared the pointer; set it back
 * non-NULL and signal so trim_thread_destroy() can return.
 */
564 spa->spa_trim_thread = curthread;
565 cv_signal(&spa->spa_trim_cv);
566 mutex_exit(&spa->spa_trim_lock);
570 (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
571 hz * trim_max_interval);
572 mutex_exit(&spa->spa_trim_lock);
/* CANFAIL: individual TRIM failures must not bring the pool down. */
574 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
576 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
577 trim_map_commit(spa, zio, spa->spa_root_vdev);
578 (void) zio_wait(zio);
579 trim_map_commit_done(spa, spa->spa_root_vdev);
580 spa_config_exit(spa, SCL_STATE, FTAG);
/*
 * Initialize the pool's TRIM synchronization primitives and start its
 * worker thread.  No-op when TRIM is disabled.
 */
585 trim_thread_create(spa_t *spa)
588 if (!zfs_trim_enabled)
591 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
592 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
593 mutex_enter(&spa->spa_trim_lock);
594 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
595 TS_RUN, minclsyspri);
596 mutex_exit(&spa->spa_trim_lock);
/*
 * Stop the pool's TRIM worker thread and destroy its synchronization
 * primitives.  The NULL/non-NULL dance on spa_trim_thread is the
 * handshake with trim_thread(): clearing it requests exit, and the
 * thread sets it back non-NULL just before exiting.
 */
600 trim_thread_destroy(spa_t *spa)
603 if (!zfs_trim_enabled)
605 if (spa->spa_trim_thread == NULL)
608 mutex_enter(&spa->spa_trim_lock);
609 /* Setting spa_trim_thread to NULL tells the thread to stop. */
610 spa->spa_trim_thread = NULL;
611 cv_signal(&spa->spa_trim_cv);
612 /* The thread will set it back to != NULL on exit. */
613 while (spa->spa_trim_thread == NULL)
614 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
615 spa->spa_trim_thread = NULL;
616 mutex_exit(&spa->spa_trim_lock);
618 cv_destroy(&spa->spa_trim_cv);
619 mutex_destroy(&spa->spa_trim_lock);
/*
 * Wake the pool's TRIM thread early (before its cv_timedwait expires),
 * e.g. when work has queued up.  No-op when TRIM is disabled or the
 * thread does not exist.
 */
623 trim_thread_wakeup(spa_t *spa)
626 if (!zfs_trim_enabled)
628 if (spa->spa_trim_thread == NULL)
631 mutex_enter(&spa->spa_trim_lock);
632 cv_signal(&spa->spa_trim_cv);
633 mutex_exit(&spa->spa_trim_lock);