/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via range_tree_add to the range tree can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
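 *
 * For example (numbers purely illustrative): with a gap size of 0x200,
 * adding [0x0, 0x1000) and then [0x1100, 0x2000) yields one bridged
 * segment [0x0, 0x2000) whose rs_fill is 0x1f00; the 0x100 bytes of
 * bridge are spanned by the segment but not counted as filled.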
 *
 * In order to traverse a range tree, use either the range_tree_walk()
 * or range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a range_tree_ops_t structure
 * to the range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that segments of range trees with gaps can only
 * support removing complete segments.
 */
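
/*
 * A minimal usage sketch (illustrative only; offsets invented). A consumer
 * tracking free space, with no callbacks, might do:
 *
 *	range_tree_t *rt = range_tree_create(NULL, NULL);
 *	range_tree_add(rt, 0x1000, 0x1000);	segment [0x1000, 0x2000)
 *	range_tree_add(rt, 0x2000, 0x1000);	merged into [0x1000, 0x3000)
 *	range_tree_remove(rt, 0x1800, 0x800);	split into [0x1000, 0x1800)
 *						and [0x2000, 0x3000)
 *	range_tree_vacate(rt, NULL, NULL);	empty the tree
 *	range_tree_destroy(rt);
 */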

kmem_cache_t *range_seg_cache;

/* Generic ops for managing an AVL tree alongside a range tree */
struct range_tree_ops rt_avl_ops = {
	.rtop_create = rt_avl_create,
	.rtop_destroy = rt_avl_destroy,
	.rtop_add = rt_avl_add,
	.rtop_remove = rt_avl_remove,
	.rtop_vacate = rt_avl_vacate,
};
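
/*
 * Sketch of how rt_avl_ops is typically wired up (modeled on the metaslab
 * code, which keeps a size-sorted secondary tree; "size_compare" below is
 * an invented name for a caller-supplied comparator):
 *
 *	avl_tree_t size_tree;
 *	range_tree_t *rt = range_tree_create_impl(&rt_avl_ops, &size_tree,
 *	    size_compare, 0);
 *
 * Each segment in rt is then mirrored into size_tree through rs_pp_node,
 * ordered by size_compare.
 */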

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}
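
/*
 * The size histogram is indexed by size class: a segment of size s is
 * accounted in bucket highbit64(s) - 1, i.e. floor(log2(s)), so a 4K
 * (0x1000) segment lands in bucket 12.
 */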
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = (const range_seg_t *)x1;
	const range_seg_t *r2 = (const range_seg_t *)x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);
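
	/*
	 * Overlapping segments compare equal: the expression below yields
	 * -1 when r1 lies entirely before r2, 1 when it lies entirely
	 * after, and 0 when the two ranges overlap, which lets avl_find()
	 * locate any segment intersecting the search range.
	 */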
	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

range_tree_t *
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
    int (*avl_compare) (const void *, const void *), uint64_t gap)
{
	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_arg = arg;
	rt->rt_avl_compare = avl_compare;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg)
{
	return (range_tree_create_impl(ops, arg, NULL, 0));
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
	ASSERT3U(rs->rs_fill + delta, !=, 0);
	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	rs->rs_fill += delta;
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (gap == 0 && rs != NULL &&
	    rs->rs_start <= start && rs->rs_end >= end) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rs->rs_start,
		    (longlong_t)rs->rs_end - rs->rs_start);
		return;
	}

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		ASSERT3U(gap, !=, 0);
		if (rs->rs_start <= start && rs->rs_end >= end) {
			range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		avl_remove(&rt->rt_root, rs);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		range_tree_stat_decr(rt, rs);
		rt->rt_space -= rs->rs_end - rs->rs_start;

		fill += rs->rs_fill;
		start = MIN(start, rs->rs_start);
		end = MAX(end, rs->rs_end);
		size = end - start;

		range_tree_add_impl(rt, start, size, fill);

		kmem_cache_free(range_seg_cache, rs);
		return;
	}

	ASSERT3P(rs, ==, NULL);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - rs_before->rs_end;
	if (merge_after && gap != 0)
		bridge_size += rs_after->rs_start - end;
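
	/*
	 * bridge_size accumulates the unfilled bytes that a gap merge will
	 * span; it is added into rt_space below so that rt_space reflects
	 * the full extent of every segment, bridged gaps included.
	 */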

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += rs_before->rs_fill + fill;
		rs_after->rs_start = rs_before->rs_start;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_fill += fill;
		rs_before->rs_end = end;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_fill += fill;
		rs_after->rs_start = start;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);

		rs->rs_fill = fill;
		rs->rs_start = start;
		rs->rs_end = end;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (gap != 0)
		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
	else
		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}
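
/* Non-gap callers always add fully-filled ranges, hence fill == size. */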
void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_add_impl(arg, start, size, size);
}

static void
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: freeing free segment "
		    "(offset=%llu size=%llu)",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill
	 * count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (rs->rs_fill == size) {
				start = rs->rs_start;
				end = rs->rs_end;
				size = end - start;
			} else {
				range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rs->rs_start != start || rs->rs_end != end) {
			zfs_panic_recover("zfs: freeing partial segment of "
			    "gap tree (offset=%llu size=%llu) of "
			    "(offset=%llu size=%llu)",
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rs->rs_start,
			    (longlong_t)rs->rs_end - rs->rs_start);
			return;
		}
	}

	VERIFY3U(rs->rs_start, <=, start);
	VERIFY3U(rs->rs_end, >=, end);

	left_over = (rs->rs_start != start);
	right_over = (rs->rs_end != end);

	range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
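
	/*
	 * Four possible outcomes, mirroring the remove cases described at
	 * the top of this file: leftovers on both sides (split the segment
	 * in two), a leftover on one side only (shorten the segment), or
	 * no leftover at all (remove the segment entirely).
	 */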
	if (left_over && right_over) {
		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		newseg->rs_start = end;
		newseg->rs_end = rs->rs_end;
		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
		range_tree_stat_incr(rt, newseg);

		rs->rs_end = start;

		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
	} else if (left_over) {
		rs->rs_end = start;
	} else if (right_over) {
		rs->rs_start = end;
	} else {
		avl_remove(&rt->rt_root, rs);
		kmem_cache_free(range_seg_cache, rs);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		rs->rs_fill = rs->rs_end - rs->rs_start;
		range_tree_stat_incr(rt, rs);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (rs->rs_end - rs->rs_start);

	range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	rs->rs_start = newstart;
	rs->rs_end = newstart + newsize;

	range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, NULL));
}

range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

void
range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs = range_tree_find(rt, off, size);

	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}

/*
 * Remove any part of this range that is present in the tree. The range
 * need not be in the tree at all, and may only partially overlap the
 * segments that are.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	if (size == 0)
		return;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);

		range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

range_seg_t *
range_tree_first(range_tree_t *rt)
{
	return (avl_first(&rt->rt_root));
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}

/* Generic range tree functions for maintaining segments in an AVL tree. */
void
rt_avl_create(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
	    offsetof(range_seg_t, rs_pp_node));
}

void
rt_avl_destroy(range_tree_t *rt, void *arg)
{
	avl_tree_t *tree = arg;

	ASSERT0(avl_numnodes(tree));
	avl_destroy(tree);
}

void
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;

	avl_add(tree, rs);
}

void
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	avl_tree_t *tree = arg;

	avl_remove(tree, rs);
}

void
rt_avl_vacate(range_tree_t *rt, void *arg)
{
	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range tree, we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	rt_avl_create(rt, arg);
}

boolean_t
range_tree_is_empty(range_tree_t *rt)
{
	ASSERT(rt != NULL);

	return (range_tree_space(rt) == 0);
}

uint64_t
range_tree_min(range_tree_t *rt)
{
	range_seg_t *rs = avl_first(&rt->rt_root);

	return (rs != NULL ? rs->rs_start : 0);
}

uint64_t
range_tree_max(range_tree_t *rt)
{
	range_seg_t *rs = avl_last(&rt->rt_root);

	return (rs != NULL ? rs->rs_end : 0);
}

uint64_t
range_tree_span(range_tree_t *rt)
{
	return (range_tree_max(rt) - range_tree_min(rt));
}