/*
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
 *
 * This module implements a general bitmap allocator/deallocator. The
 * allocator eats around 2 bits per 'block'. The module does not
 * try to interpret the meaning of a 'block' other than to return
 * SWAPBLK_NONE on an allocation failure.
 *
 * A radix tree controls access to pieces of the bitmap, and includes
 * auxiliary information at each interior node about the availability of
 * contiguous free blocks in the subtree rooted at that node. Two radix
 * constants are involved: one for the size of the bitmaps contained in the
 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
 * associated with a range of blocks. The root of any subtree stores a
 * hint field that defines an upper bound on the size of the largest
 * allocation that can begin in the associated block range. A hint is an
 * upper bound on a potential allocation, but not necessarily a tight upper
 * bound.
 *
 * The radix tree also implements two collapsed states for meta nodes:
 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
 * in either of these two states, all information contained underneath
 * the node is considered stale. These states are used to optimize
 * allocation and freeing operations.
 *
 * The hinting greatly increases code efficiency for allocations while
 * the general radix structure optimizes both allocations and frees. The
 * radix tree should be able to operate well no matter how much
 * fragmentation there is and no matter how large a bitmap is used.
 *
 * The blist code wires all necessary memory at creation time. Neither
 * allocations nor frees require interaction with the memory subsystem.
 * The non-blocking features of the blist code are used in the swap code.
 *
 * LAYOUT: The radix tree is laid out recursively using a
 * linear array. Each meta node is immediately followed (laid out
 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
 * is a recursive structure but one that can be easily scanned through
 * a very simple 'skip' calculation. In order to support large radixes,
 * portions of the tree may reside outside our memory allocation. We
 * handle this with an early-termination optimization (when bighint is
 * set to -1) on the scan. The memory allocation is only large enough
 * to cover the number of blocks requested at creation time, even if it
 * must be encompassed by a larger root-node radix.
 *
 * NOTE: the allocator cannot currently allocate more than
 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
 * large' if you try. This is an area that could use improvement. The
 * radix is large enough that this restriction does not affect the swap
 * system, though. Currently only the allocation code is affected by
 * this algorithmic unfeature. The freeing code can handle arbitrary
 * ranges.
 *
 * This code can be compiled stand-alone for debugging.
 */
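
/*
 * Illustrative layout sketch, assuming a hypothetical BLIST_META_RADIX of 4
 * so the picture stays small (the real constant comes from sys/blist.h).
 * With a single meta level the linear array would look like:
 *
 *	index 0: meta node covering the whole block range
 *	index 1: leaf for blocks [0, BLIST_BMAP_RADIX)
 *	index 2: leaf for blocks [BLIST_BMAP_RADIX, 2 * BLIST_BMAP_RADIX)
 *	index 3: leaf for the third BLIST_BMAP_RADIX-sized range
 *	index 4: leaf for the fourth BLIST_BMAP_RADIX-sized range
 *
 * Deeper trees apply the same rule recursively: child j of the meta node at
 * index i begins at index i + 1 + j * next_skip, where next_skip is the
 * number of array slots occupied by one child subtree.
 */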
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/mutex.h>

#ifndef BLIST_NO_DEBUG

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>

#define bitcount64(x) __bitcount64((uint64_t)(x))
#define malloc(a,b,c) calloc(a, 1)
#define free(a,b) free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);
 * static support functions

static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
    u_daddr_t radix);
static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
    int tab);

static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
#define BLIST_META_MASK (BLIST_META_RADIX - 1)
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
 * in the 'meta' functions that process subtrees. Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
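
/*
 * Worked example (illustrative, assuming the usual sys/blist.h values of
 * BLIST_BMAP_RADIX = 64 and BLIST_META_RADIX = 16): a subtree with
 * radix = 1024 has L = 1024 / 64 = 16 leaves and height h = 1, so it
 * occupies 1 + 16 = 17 array slots; the formula agrees,
 * 1024 / ((64 / 16) * 15) = 1024 / 60 = 17 with integer division.
 * For radix = 16384 (h = 2): 1 + 16 + 256 = 273 slots, and 16384 / 60 = 273.
 */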
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
bitpos(u_daddr_t mask)

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
 * blist_create() - create a blist capable of handling up to the specified
 *		     number of blocks
 *
 *	blocks - must be greater than 0
 *	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
blist_create(daddr_t blocks, int flags)

	daddr_t i, last_block;
	u_daddr_t nodes, radix, skip;

	 * Calculate the radix and node count used for scanning. Find the last
	 * block that is followed by a terminator.
	last_block = blocks - 1;
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
			 * A terminator will be added. Update last_block to the
			 * position just before that terminator.
			last_block |= radix - 1;
		radix *= BLIST_META_RADIX;

	 * Count the meta-nodes in the expanded tree, including the final
	 * terminator, from the bottom level up to the root.
	nodes = (last_block >= blocks) ? 2 : 1;
	last_block /= BLIST_BMAP_RADIX;
	while (last_block > 0) {
		nodes += last_block + 1;
		last_block /= BLIST_META_RADIX;
	bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
	bl->bl_blocks = blocks;
	bl->bl_radix = radix;

	 * Initialize the empty tree by filling in root values, then initialize
	 * just the terminators in the rest of the tree.
	bl->bl_root[0].bm_bighint = 0;
	if (radix == BLIST_BMAP_RADIX)
		bl->bl_root[0].u.bmu_bitmap = 0;
		bl->bl_root[0].u.bmu_avail = 0;
	last_block = blocks - 1;
	while (radix > BLIST_BMAP_RADIX) {
		radix /= BLIST_META_RADIX;
		skip = radix_to_skip(radix);
		digit = last_block / radix;
		i += 1 + digit * skip;
		if (digit != BLIST_META_MASK) {
			bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
			bl->bl_root[i + skip].u.bmu_bitmap = 0;

#if defined(BLIST_DEBUG)
	    "BLIST representing %lld blocks (%lld MB of swap)"
	    ", requiring %lldK of ram\n",
	    (long long)bl->bl_blocks,
	    (long long)bl->bl_blocks * 4 / 1024,
	    (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	printf("BLIST raw radix tree contains %lld records\n",
blist_destroy(blist_t bl)

 * blist_alloc() - reserve space in the block bitmap. Return the base
 *		    of a contiguous region or SWAPBLK_NONE if space could
 *		    not be allocated.
blist_alloc(blist_t bl, daddr_t count)

	 * This loop iterates at most twice. An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero. When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
		} else if (bl->bl_cursor != 0)
	return (SWAPBLK_NONE);
 * blist_avail() - return the number of free blocks.
blist_avail(blist_t bl)

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	return (bl->bl_root->u.bmu_avail);

 * blist_free() - free up space in the block bitmap. Return the base
 *		   of a contiguous region. Panic if an inconsistency is
 *		   found.
blist_free(blist_t bl, daddr_t blkno, daddr_t count)

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
 * blist_fill() - mark a region in the block bitmap as off-limits
 *		   to the allocator (i.e. allocate it), ignoring any
 *		   existing allocations. Return the number of blocks
 *		   actually filled that were free before the call.
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
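
/*
 * Illustrative usage sketch, not part of the original file. A typical
 * consumer creates a blist (whose blocks all start out allocated), frees
 * the range it wants to manage, and then carves blocks out of it with
 * blist_alloc(). The BLIST_EXAMPLE guard is hypothetical and never defined,
 * so this block is documentation only.
 */
#ifdef BLIST_EXAMPLE
static void
blist_example(void)
{
	blist_t bl;
	daddr_t blk;

	bl = blist_create(1024, M_WAITOK);	/* manage 1024 blocks */
	blist_free(bl, 0, 1024);		/* everything starts allocated; free it */

	blk = blist_alloc(bl, 16);		/* reserve 16 contiguous blocks */
	if (blk != SWAPBLK_NONE)
		blist_free(bl, blk, 16);	/* give them back */

	blist_destroy(bl);
}
#endif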
 * blist_resize() - resize an existing radix tree to handle the
 *		     specified number of blocks. This will reallocate
 *		     the tree and transfer the previous bitmap to the new
 *		     one. When extending the tree you can specify whether
 *		     the new blocks are to be left allocated or freed.
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)

	blist_t newbl = blist_create(count, flags);

	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	 * If resizing upwards, should we free the new space or not?
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
 * blist_print() - dump radix tree
blist_print(blist_t bl)

	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
struct gap_stats {
	daddr_t start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t num;		/* number of gaps observed */
	daddr_t max;		/* largest gap size */
	daddr_t avg;		/* average gap size */
	daddr_t err;		/* sum - num * avg */
	daddr_t histo[nitems(fib)]; /* # gaps in each size range */
	int max_bucket;		/* last histo elt with nonzero val */
};

 * gap_stats_counting() - is the state 'counting 1 bits'?
 *			   or 'skipping 0 bits'?
gap_stats_counting(const struct gap_stats *stats)

	return (stats->start != SWAPBLK_NONE);
 * init_gap_stats() - initialize stats on gap sizes
init_gap_stats(struct gap_stats *stats)

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;

 * update_gap_stats() - update stats on gap sizes
update_gap_stats(struct gap_stats *stats, daddr_t posn)

	if (!gap_stats_counting(stats)) {
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)

	 * Find the fibonacci range that contains size,
	 * expecting to find it in an early range.
	while (hi < nitems(fib) && fib[hi] <= size) {
	if (hi >= nitems(fib))
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
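
/*
 * Example (illustrative): a gap of 100 blocks lands in the Fibonacci
 * bucket fib[9] = 89, i.e. the histogram row covering sizes 89..143.
 * The avg/err pair keeps an exact integer mean via the documented
 * invariant sum = num * avg + err: after observing gaps of 3, 4 and 8
 * blocks, avg = 5 and err = 0.
 */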
 * dump_gap_stats() - print stats on gap sizes
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, " count | size range\n");
	sbuf_printf(s, " ----- | ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd | ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
	sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
 * blist_stats() - dump radix tree stats
blist_stats(blist_t bl, struct sbuf *s)

	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	while (i < bl->bl_radix + bl->bl_blocks) {
		 * Find max size subtree starting at i.
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		 * Check for skippable subtrees starting at i.
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
			radix /= BLIST_META_RADIX;
		if (radix == BLIST_BMAP_RADIX) {
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				update_gap_stats(stats, i + bitpos(bit));
		nodes += radix_to_skip(radix);
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
/************************************************************************
 *			ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 * These support functions do all the actual work. They may seem
 * rather longish, but that's because I've commented them up. The
 * actual code is straightforward.

 * blst_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case. Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)

	int count1, hi, lo, num_shifts, range1, range_ext;

	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap. The value of range1 is equal to
		 * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
		 * while preserving these invariants. The updates to mask leave
		 * fewer bits set, but each bit that remains set represents a
		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
		 * If more updates to mask cannot clear more bits, because mask
		 * is partitioned with all 0 bits preceding all 1 bits, the loop
		 * terminates immediately.
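		 *
		 * Example (illustrative, not from the original comment): for
		 * count = 4 (count1 = 3, num_shifts = 2), the loop folds mask
		 * into itself with shifts of 1 and then 2; once num_shifts
		 * reaches 0, any bit i still set means bits [i, i+3] of the
		 * original bitmap were set, i.e. a run of at least 4 free
		 * blocks starts at bit i of this leaf.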
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		mask &= (daddr_t)mask >> range_ext;

	 * Update bighint. There is no allocation bigger than range1
	 * starting in this leaf.
	scan->bm_bighint = range1;
	return (SWAPBLK_NONE);

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
		return (SWAPBLK_NONE);

	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size. Clear all the bits but that one,
	 * and then find its position.
	if (hi > BLIST_BMAP_RADIX) {
		 * An allocation within this leaf is impossible, so a successful
		 * allocation depends on the next leaf providing some of the blocks.
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			 * The next leaf has a different meta-node parent, so it
			 * is not necessarily initialized. Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation. The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			return (SWAPBLK_NONE);
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;

	/* Set the bits of mask at position 'lo' and higher. */
	if (hi == BLIST_BMAP_RADIX) {
		 * Update bighint. There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		scan->bm_bighint = range1;
	/* Clear the bits of mask at position 'hi' and higher. */
	mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
	/* If this allocation uses all the bits, clear the hint. */
	if (mask == scan->u.bmu_bitmap)
		scan->bm_bighint = 0;

	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
 * blst_meta_alloc() - allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node. If we can't, we update
 *	bighint and return a failure. Updating bighint optimizes future
 *	calls that hit this node. We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)

	daddr_t blk, i, next_skip, r, skip;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks. Reduce the hint, and
		 * return failure.
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		 * Reinitialize each of the meta node's children. An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		for (i = 1; i < skip; i += next_skip) {
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		radix /= BLIST_META_RADIX;

		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		panic("allocation too large");
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			 * The allocation might fit beginning in the i'th subtree.
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
		} else if (scan[i].bm_bighint == (daddr_t)-1) {

	 * We couldn't allocate count in this subtree, update bighint.
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)

	 * free some data in this bitmap
	 * mask=0000111111111110000
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	 * We could probably do a better job here. We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data. If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	scan->bm_bighint = BLIST_BMAP_RADIX;
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node. If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)

	daddr_t blk, i, next_skip, skip, v;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix) {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
					scan[i].u.bmu_avail = 0;
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */

	 * ALL-FREE special case.
	if (scan->u.bmu_avail == radix)
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,

	 * Break the free down into its components
	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
 * BLIST_RADIX_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree. The space may not already be free in the destination.
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)

	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
			if (v & ((u_daddr_t)1 << i))
				blist_free(dest, blk + i, 1);

	if (scan->u.bmu_avail == 0) {
		 * Source all allocated, leave dest allocated
	if (scan->u.bmu_avail == radix) {
		 * Source all free, free entire dest
		blist_free(dest, blk, count);
		blist_free(dest, blk, radix);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			blst_copy(&scan[i], blk, radix, dest, count);
 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range. Returns
 *	the number of blocks allocated by the call.
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;

 * BLIST_META_FILL() - allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range. The
 *	range must be within the extent of this node. Returns the
 *	number of blocks allocated by the call.
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)

	daddr_t blk, i, nblks, next_skip, skip, v;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		panic("fill too large");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		 * ALL-ALLOCATED special case
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		 * Reinitialize each of the meta node's children. An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		for (i = 1; i < skip; i += next_skip) {
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		radix /= BLIST_META_RADIX;

	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
	scan->u.bmu_avail -= nblks;
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)

	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
	if (scan->u.bmu_avail == 0) {
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
	if (scan->u.bmu_avail == radix) {
		    "%*.*s(%08llx,%lld) ALL FREE\n",
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)scan->bm_bighint

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			    "%*.*s(%08llx,%lld): Terminator\n",
			    (long long)blk, (long long)radix
		blst_radix_print(&scan[i], blk, radix, tab);
main(int ac, char **av)

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
			size = strtol(ptr, NULL, 0);
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			s = sbuf_new_auto();
			printf("%s", sbuf_data(s));
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf(" R=%08llx\n", (long long)blk);
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				    (intmax_t)blist_fill(bl, da, count));

panic(const char *ctl, ...)

	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");