4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
23 * All rights reserved.
26 #include <sys/zfs_context.h>
27 #include <sys/spa_impl.h>
28 #include <sys/vdev_impl.h>
29 #include <sys/trim_map.h>
/*
 * Per-leaf-vdev TRIM state.  Queued segments live in two parallel
 * structures: an offset-sorted AVL tree (for overlap lookups) and a
 * txg-ordered list (for age-based commit).  Separate trees track TRIMs
 * and writes currently in flight so they can be ordered against each
 * other.  NOTE(review): the tail of this struct (including its mutex,
 * tm_lock, used throughout below) is not visible in this view.
 */
31 typedef struct trim_map {
32 list_t tm_head; /* List of segments sorted by txg. */
33 avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
34 avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
35 avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
36 list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
/*
 * One freed extent [ts_start, ts_end) awaiting TRIM.  Each segment is
 * linked into both the offset-sorted AVL tree and the txg-ordered list
 * of its trim map (see struct trim_map above).
 */
40 typedef struct trim_seg {
41 avl_node_t ts_node; /* AVL node. */
42 list_node_t ts_next; /* List element. */
43 uint64_t ts_start; /* Starting offset of this segment. */
44 uint64_t ts_end; /* Ending offset (non-inclusive). */
45 uint64_t ts_txg; /* Segment creation txg. */
/* Global TRIM kill switch, defined elsewhere; checked on every hot path. */
48 extern boolean_t zfs_notrim;
50 SYSCTL_DECL(_vfs_zfs);
51 /* Delay TRIMs by that many TXGs. */
52 static int trim_txg_limit = 64;
/* Expose trim_txg_limit as a loader tunable and a read-write sysctl. */
53 TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
54 SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
55 "Delay TRIMs by that many TXGs.");
/* Forward declaration: trim_map_destroy() calls this before its definition. */
57 static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
/*
 * AVL comparator for trim segments, ordered by starting offset.
 * The nested checks detect overlap: a segment that begins earlier but
 * extends past the other's start (or vice versa) intersects it, and
 * avl_find() relies on overlapping segments comparing as equal.
 * NOTE(review): the return statements for each case are elided in this
 * view; only the comparison structure is visible.
 */
60 trim_map_seg_compare(const void *x1, const void *x2)
62 const trim_seg_t *s1 = x1;
63 const trim_seg_t *s2 = x2;
65 if (s1->ts_start < s2->ts_start) {
/* s1 begins first; does it reach into s2? */
66 if (s1->ts_end > s2->ts_start)
70 if (s1->ts_start > s2->ts_start) {
/* s2 begins first; does s1 begin inside s2? */
71 if (s1->ts_start < s2->ts_end)
/*
 * AVL comparator for in-flight write zios, ordered by io_offset.
 * Mirrors trim_map_seg_compare(): zios whose [offset, offset + size)
 * ranges intersect compare as equal, so avl_find() with a search key
 * locates any write overlapping a given range.  NOTE(review): the
 * declarations of z1/z2 and the return statements are elided here.
 */
79 trim_map_zio_compare(const void *x1, const void *x2)
84 if (z1->io_offset < z2->io_offset) {
85 if (z1->io_offset + z1->io_size > z2->io_offset)
89 if (z1->io_offset > z2->io_offset) {
90 if (z1->io_offset < z2->io_offset + z2->io_size)
/*
 * Allocate and initialize the TRIM map for a leaf vdev and attach it
 * to vd->vdev_trimmap.  Both AVL trees of segments share the same
 * comparator and embedded node (ts_node); a segment is only ever in
 * one of them at a time.
 */
98 trim_map_create(vdev_t *vd)
/* TRIM maps are tracked per physical device only. */
102 ASSERT(vd->vdev_ops->vdev_op_leaf);
107 tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
108 mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
109 list_create(&tm->tm_head, sizeof (trim_seg_t),
110 offsetof(trim_seg_t, ts_next));
111 list_create(&tm->tm_pending_writes, sizeof (zio_t),
112 offsetof(zio_t, io_trim_link));
113 avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
114 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
115 avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
116 sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
117 avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
118 sizeof (zio_t), offsetof(zio_t, io_trim_node));
119 vd->vdev_trimmap = tm;
/*
 * Tear down a leaf vdev's TRIM map: flush completed in-flight state,
 * free every still-queued segment, then destroy the containers and the
 * map itself.  Safe to call when no map exists (tm checked elsewhere;
 * the early-return lines are elided in this view).
 */
123 trim_map_destroy(vdev_t *vd)
128 ASSERT(vd->vdev_ops->vdev_op_leaf);
133 tm = vd->vdev_trimmap;
138 * We may have been called before trim_map_vdev_commit_done()
139 * had a chance to run, so do it now to prune the remaining
142 trim_map_vdev_commit_done(vd->vdev_spa, vd);
144 mutex_enter(&tm->tm_lock);
/*
 * Every queued segment sits in both tm_head and tm_queued_frees;
 * drain via the list and drop it from the tree before freeing.
 */
145 while ((ts = list_head(&tm->tm_head)) != NULL) {
146 avl_remove(&tm->tm_queued_frees, ts);
147 list_remove(&tm->tm_head, ts);
148 kmem_free(ts, sizeof (*ts));
150 mutex_exit(&tm->tm_lock);
/* All containers are empty now; destroy them and the map. */
152 avl_destroy(&tm->tm_queued_frees);
153 avl_destroy(&tm->tm_inflight_frees);
154 avl_destroy(&tm->tm_inflight_writes);
155 list_destroy(&tm->tm_pending_writes);
156 list_destroy(&tm->tm_head);
157 mutex_destroy(&tm->tm_lock);
158 kmem_free(tm, sizeof (*tm));
159 vd->vdev_trimmap = NULL;
/*
 * Queue the extent [start, end) created in txg for a future TRIM,
 * coalescing with adjacent segments from the same txg.  If the range
 * overlaps an existing segment, only the non-overlapping left/right
 * pieces are added (recursively); the overlapping middle is already
 * queued.  Caller must hold tm_lock.
 */
163 trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
166 trim_seg_t tsearch, *ts_before, *ts_after, *ts;
167 boolean_t merge_before, merge_after;
169 ASSERT(MUTEX_HELD(&tm->tm_lock));
172 tsearch.ts_start = start;
173 tsearch.ts_end = end;
/* Overlapping segments compare equal, so this finds any intersection. */
175 ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
/* Overlap: recurse on the pieces sticking out on either side. */
177 if (start < ts->ts_start)
178 trim_map_segment_add(tm, start, ts->ts_start, txg);
179 if (end > ts->ts_end)
180 trim_map_segment_add(tm, ts->ts_end, end, txg);
/* No overlap: look at the neighbors of the insertion point. */
184 ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
185 ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
/* Only merge with exactly-adjacent segments from the same txg. */
187 merge_before = (ts_before != NULL && ts_before->ts_end == start &&
188 ts_before->ts_txg == txg);
189 merge_after = (ts_after != NULL && ts_after->ts_start == end &&
190 ts_after->ts_txg == txg);
192 if (merge_before && merge_after) {
/* Bridge the gap: fold ts_before into ts_after and free it. */
193 avl_remove(&tm->tm_queued_frees, ts_before);
194 list_remove(&tm->tm_head, ts_before);
195 ts_after->ts_start = ts_before->ts_start;
196 kmem_free(ts_before, sizeof (*ts_before));
197 } else if (merge_before) {
198 ts_before->ts_end = end;
199 } else if (merge_after) {
200 ts_after->ts_start = start;
/* No neighbor to merge with: insert a fresh segment. */
202 ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
203 ts->ts_start = start;
206 avl_insert(&tm->tm_queued_frees, ts, where);
/* tm_head is txg-ordered; new segments always go to the tail. */
207 list_insert_tail(&tm->tm_head, ts);
/*
 * Remove from segment ts the portion overlapping [start, end), keeping
 * any part of ts that extends outside the range.  When the range falls
 * strictly inside ts, ts is split: a new segment (nts) takes the
 * right-hand remainder.  Caller must hold tm_lock.  NOTE(review): the
 * bodies of the left-only/right-only trim branches are elided in this
 * view.
 */
212 trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
216 boolean_t left_over, right_over;
218 ASSERT(MUTEX_HELD(&tm->tm_lock));
220 left_over = (ts->ts_start < start);
221 right_over = (ts->ts_end > end);
223 if (left_over && right_over) {
/* Range is interior: split ts, nts inherits the right remainder. */
224 nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
226 nts->ts_end = ts->ts_end;
227 nts->ts_txg = ts->ts_txg;
/* nts sorts immediately after the (shrunken) ts in both structures. */
229 avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
230 list_insert_after(&tm->tm_head, ts, nts);
231 } else if (left_over) {
233 } else if (right_over) {
/* Range covers ts entirely: drop and free it. */
236 avl_remove(&tm->tm_queued_frees, ts);
237 list_remove(&tm->tm_head, ts);
238 kmem_free(ts, sizeof (*ts));
/*
 * Record the freed extent [start, end) from txg.  If the extent
 * overlaps a write still in flight, the overlapping part must not be
 * TRIMmed; recurse on the pieces before and after the write instead.
 * Caller must hold tm_lock.  NOTE(review): the branch structure around
 * the avl_find() result is partially elided in this view.
 */
243 trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
247 ASSERT(MUTEX_HELD(&tm->tm_lock));
/* Search key: overlapping zios compare equal to this range. */
249 zsearch.io_offset = start;
250 zsearch.io_size = end - start;
252 zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
/* No colliding write: queue the whole extent. */
254 trim_map_segment_add(tm, start, end, txg);
/* Colliding write zs: queue only the parts outside it. */
257 if (start < zs->io_offset)
258 trim_map_free_locked(tm, start, zs->io_offset, txg);
259 if (zs->io_offset + zs->io_size < end)
260 trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
/*
 * Entry point for a completed free zio: queue its extent for TRIM,
 * tagged with the currently syncing txg.  No-op when TRIM is disabled
 * globally, unsupported by the device, or no map exists.
 */
264 trim_map_free(zio_t *zio)
266 vdev_t *vd = zio->io_vd;
267 trim_map_t *tm = vd->vdev_trimmap;
269 if (zfs_notrim || vd->vdev_notrim || tm == NULL)
272 mutex_enter(&tm->tm_lock);
273 trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size,
274 vd->vdev_spa->spa_syncing_txg);
275 mutex_exit(&tm->tm_lock);
/*
 * Called before a write zio is issued.  Orders the write against TRIM
 * activity on the same range: if it collides with an in-flight TRIM it
 * is parked on tm_pending_writes (reissued later by
 * trim_map_vdev_commit_done()); otherwise any queued-but-unissued
 * frees it overlaps are cancelled and the write is registered as in
 * flight.  NOTE(review): the return statements (the pending path
 * appears to return a different value than the issue path) are elided
 * in this view.
 */
279 trim_map_write_start(zio_t *zio)
281 vdev_t *vd = zio->io_vd;
282 trim_map_t *tm = vd->vdev_trimmap;
283 trim_seg_t tsearch, *ts;
284 boolean_t left_over, right_over;
287 if (zfs_notrim || vd->vdev_notrim || tm == NULL)
290 start = zio->io_offset;
291 end = start + zio->io_size;
292 tsearch.ts_start = start;
293 tsearch.ts_end = end;
295 mutex_enter(&tm->tm_lock);
298 * Checking for colliding in-flight frees.
300 ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
/* Collision: block the write until the TRIM batch completes. */
302 list_insert_tail(&tm->tm_pending_writes, zio);
303 mutex_exit(&tm->tm_lock);
/* Cancel queued frees the write overlaps — the data is live again. */
307 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
310 * Loop until all overlapping segments are removed.
313 trim_map_segment_remove(tm, ts, start, end);
314 ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
315 } while (ts != NULL);
/* Track the write so trim_map_free_locked() can avoid its range. */
317 avl_add(&tm->tm_inflight_writes, zio);
319 mutex_exit(&tm->tm_lock);
/*
 * Called when a write zio completes: drop it from the in-flight-writes
 * tree if it was ever added there.
 */
325 trim_map_write_done(zio_t *zio)
327 vdev_t *vd = zio->io_vd;
328 trim_map_t *tm = vd->vdev_trimmap;
331 * Don't check for vdev_notrim, since the write could have
332 * started before vdev_notrim was set.
334 if (zfs_notrim || tm == NULL)
337 mutex_enter(&tm->tm_lock);
339 * Don't fail if the write isn't in the tree, since the write
340 * could have started after vdev_notrim was set.
/*
 * Membership test by inspecting the embedded AVL node directly: a
 * linked node has a child, a parent, or is the tree's root.  Only a
 * node actually in the tree may be passed to avl_remove().
 */
342 if (zio->io_trim_node.avl_child[0] ||
343 zio->io_trim_node.avl_child[1] ||
344 AVL_XPARENT(&zio->io_trim_node) ||
345 tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
346 avl_remove(&tm->tm_inflight_writes, zio);
347 mutex_exit(&tm->tm_lock);
351 * Return the oldest segment (the one with the lowest txg) or NULL if
352 * the list is empty or the first element's txg is greater than the txg
353 * given as function argument.
356 trim_map_first(trim_map_t *tm, uint64_t txg)
360 ASSERT(MUTEX_HELD(&tm->tm_lock));
/* tm_head is kept in txg order, so the head is the oldest segment. */
362 ts = list_head(&tm->tm_head);
363 if (ts != NULL && ts->ts_txg <= txg)
/*
 * Issue TRIM zios (as children of the caller's root zio) for every
 * queued segment old enough to commit on this leaf vdev.  Issued
 * segments move from the queued structures to tm_inflight_frees, where
 * they stay until trim_map_vdev_commit_done() prunes them.
 */
369 trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
371 trim_map_t *tm = vd->vdev_trimmap;
373 uint64_t start, size, txglimit;
375 ASSERT(vd->vdev_ops->vdev_op_leaf);
/*
 * Only commit segments at least trim_txg_limit txgs old, never past a
 * freeze txg.  NOTE(review): the subtrahend on the following line is
 * elided in this view — presumably trim_txg_limit.
 */
380 txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) -
383 mutex_enter(&tm->tm_lock);
385 * Loop until we send all frees up to the txglimit.
387 while ((ts = trim_map_first(tm, txglimit)) != NULL) {
388 list_remove(&tm->tm_head, ts);
389 avl_remove(&tm->tm_queued_frees, ts);
/* From here on, writes to this range will be held back. */
390 avl_add(&tm->tm_inflight_frees, ts);
391 zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
392 ts->ts_end - ts->ts_start));
394 mutex_exit(&tm->tm_lock);
/*
 * Called after a TRIM commit pass completes on a leaf vdev: free the
 * in-flight segments (their TRIMs are done) and reissue any writes
 * that were parked while those TRIMs were outstanding.  The pending
 * writes are moved to a local list so they can be reissued without
 * holding tm_lock.
 */
398 trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
400 trim_map_t *tm = vd->vdev_trimmap;
402 list_t pending_writes;
404 uint64_t start, size;
407 ASSERT(vd->vdev_ops->vdev_op_leaf);
412 mutex_enter(&tm->tm_lock);
413 if (!avl_is_empty(&tm->tm_inflight_frees)) {
/* Bulk-free the tree contents; avl_destroy_nodes() empties it. */
415 while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
417 kmem_free(ts, sizeof (*ts));
420 list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
422 list_move_tail(&pending_writes, &tm->tm_pending_writes);
423 mutex_exit(&tm->tm_lock);
/* Reissue the blocked writes now that the colliding TRIMs finished. */
425 while ((zio = list_remove_head(&pending_writes)) != NULL) {
426 zio_vdev_io_reissue(zio);
429 list_destroy(&pending_writes);
/*
 * Recursively walk the vdev tree and commit queued TRIMs on every
 * leaf.  Nothing can be old enough to commit until at least
 * trim_txg_limit txgs have synced, so bail out early before that.
 */
433 trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
437 if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
440 if (vd->vdev_ops->vdev_op_leaf) {
441 trim_map_vdev_commit(spa, zio, vd);
444 for (c = 0; c < vd->vdev_children; c++)
444 trim_map_commit(spa, zio, vd->vdev_child[c]);
/*
 * Recursively walk the vdev tree and run the post-commit cleanup
 * (trim_map_vdev_commit_done()) on every leaf.
 */
449 trim_map_commit_done(spa_t *spa, vdev_t *vd)
456 if (vd->vdev_ops->vdev_op_leaf) {
457 trim_map_vdev_commit_done(spa, vd);
459 for (c = 0; c < vd->vdev_children; c++)
460 trim_map_commit_done(spa, vd->vdev_child[c]);
/*
 * Per-pool TRIM worker thread.  Sleeps on spa_trim_cv until woken
 * (trim_thread_wakeup()), then commits queued TRIMs across the whole
 * vdev tree under SCL_STATE.  Shutdown handshake: trim_thread_destroy()
 * sets spa_trim_thread to NULL; this thread acknowledges by setting it
 * back to curthread, signalling, and exiting.  NOTE(review): the
 * enclosing for(;;) loop and thread-exit lines are elided in this view.
 */
465 trim_thread(void *arg)
471 mutex_enter(&spa->spa_trim_lock);
/* Stop requested (spa_trim_thread cleared): ack and exit. */
472 if (spa->spa_trim_thread == NULL) {
473 spa->spa_trim_thread = curthread;
474 cv_signal(&spa->spa_trim_cv);
475 mutex_exit(&spa->spa_trim_lock);
/* Wait for a wakeup from trim_thread_wakeup() or _destroy(). */
478 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
479 mutex_exit(&spa->spa_trim_lock);
/* CANFAIL: a failed TRIM must not take the pool down. */
481 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
483 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
484 trim_map_commit(spa, zio, spa->spa_root_vdev);
485 (void) zio_wait(zio);
486 trim_map_commit_done(spa, spa->spa_root_vdev);
487 spa_config_exit(spa, SCL_STATE, FTAG);
/*
 * Initialize the pool's TRIM synchronization primitives and spawn the
 * TRIM worker thread (trim_thread).
 */
492 trim_thread_create(spa_t *spa)
498 mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
499 cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
500 mutex_enter(&spa->spa_trim_lock);
501 spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
502 TS_RUN, minclsyspri);
503 mutex_exit(&spa->spa_trim_lock);
/*
 * Stop the pool's TRIM worker thread and tear down its lock and
 * condition variable.  Uses a two-way handshake with trim_thread():
 * clearing spa_trim_thread requests the stop, the thread sets it
 * non-NULL again to acknowledge before exiting.
 */
507 trim_thread_destroy(spa_t *spa)
/* Nothing to do if the thread was never created. */
512 if (spa->spa_trim_thread == NULL)
515 mutex_enter(&spa->spa_trim_lock);
516 /* Setting spa_trim_thread to NULL tells the thread to stop. */
517 spa->spa_trim_thread = NULL;
518 cv_signal(&spa->spa_trim_cv);
519 /* The thread will set it back to != NULL on exit. */
520 while (spa->spa_trim_thread == NULL)
521 cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
522 spa->spa_trim_thread = NULL;
523 mutex_exit(&spa->spa_trim_lock);
525 cv_destroy(&spa->spa_trim_cv);
526 mutex_destroy(&spa->spa_trim_lock);
/*
 * Wake the pool's TRIM worker thread so it runs a commit pass.
 * NOTE(review): spa_trim_thread is read here without spa_trim_lock —
 * presumably a benign fast-path check; verify against callers.
 */
530 trim_thread_wakeup(spa_t *spa)
535 if (spa->spa_trim_thread == NULL)
538 mutex_enter(&spa->spa_trim_lock);
539 cv_signal(&spa->spa_trim_cv);
540 mutex_exit(&spa->spa_trim_lock);