2 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions
6 * 1. Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * 2. Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * 3. Neither the name of the University nor the names of its contributors
12 * may be used to endorse or promote products derived from this software
13 * without specific prior written permission.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
30 * This module implements a general bitmap allocator/deallocator. The
31 * allocator eats around 2 bits per 'block'. The module does not
32 * try to interpret the meaning of a 'block' other than to return
33 * SWAPBLK_NONE on an allocation failure.
35 * A radix tree controls access to pieces of the bitmap, and includes
36 * auxiliary information at each interior node about the availability of
37 * contiguous free blocks in the subtree rooted at that node. Two radix
38 * constants are involved: one for the size of the bitmaps contained in the
39 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
40 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
41 * associated with a range of blocks. The root of any subtree stores a
42 * hint field that defines an upper bound on the size of the largest
43 * allocation that can begin in the associated block range. A hint is an
44 * upper bound on a potential allocation, but not necessarily a tight upper
47 * The radix tree also implements two collapsed states for meta nodes:
48 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
49 * in either of these two states, all information contained underneath
50 * the node is considered stale. These states are used to optimize
51 * allocation and freeing operations.
53 * The hinting greatly increases code efficiency for allocations while
54 * the general radix structure optimizes both allocations and frees. The
55 * radix tree should be able to operate well no matter how much
56 * fragmentation there is and no matter how large a bitmap is used.
58 * The blist code wires all necessary memory at creation time. Neither
59 * allocations nor frees require interaction with the memory subsystem.
60 * The non-blocking features of the blist code are used in the swap code
63 * LAYOUT: The radix tree is laid out recursively using a
64 * linear array. Each meta node is immediately followed (laid out
65 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
66 * is a recursive structure but one that can be easily scanned through
67 * a very simple 'skip' calculation. In order to support large radixes,
68 * portions of the tree may reside outside our memory allocation. We
69 * handle this with an early-termination optimization (when bighint is
70 * set to -1) on the scan. The memory allocation is only large enough
71 * to cover the number of blocks requested at creation time even if it
72 * must be encompassed in larger root-node radix.
74 * NOTE: the allocator cannot currently allocate more than
75 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
76 * large' if you try. This is an area that could use improvement. The
77 * radix is large enough that this restriction does not affect the swap
78 * system, though. Currently only the allocation code is affected by
79 * this algorithmic unfeature. The freeing code can handle arbitrary
82 * This code can be compiled stand-alone for debugging.
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD$");
90 #include <sys/param.h>
91 #include <sys/systm.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
98 #include <sys/mutex.h>
102 #ifndef BLIST_NO_DEBUG
106 #include <sys/types.h>
107 #include <sys/malloc.h>
108 #include <sys/sbuf.h>
115 #define bitcount64(x) __bitcount64((uint64_t)(x))
116 #define malloc(a,b,c) calloc(a, 1)
117 #define free(a,b) free(a)
118 static __inline int imax(int a, int b) { return (a > b ? a : b); }
120 #include <sys/blist.h>
122 void panic(const char *ctl, ...);
127 * static support functions
129 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
130 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
132 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
133 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
135 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
136 blist_t dest, daddr_t count);
137 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
138 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
140 static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
142 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
147 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
150 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
151 "radix divisibility error");
152 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
153 #define BLIST_META_MASK (BLIST_META_RADIX - 1)
156 * For a subtree that can represent the state of up to 'radix' blocks, the
157 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
158 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
161 * in the 'meta' functions that process subtrees. Since integer division
162 * discards remainders, we can express this computation as
163 * skip = (m * m**h) / (m - 1)
164 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
165 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
166 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
167 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
168 * so that simple integer division by a constant can safely be used for the
171 static inline daddr_t
172 radix_to_skip(daddr_t radix)
176 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
181 * Assumes that the argument has only one bit set.
184 bitpos(u_daddr_t mask)
188 switch (sizeof(mask)) {
189 #ifdef HAVE_INLINE_FFSLL
190 case sizeof(long long):
191 return (ffsll(mask) - 1);
195 hi = BLIST_BMAP_RADIX;
196 while (lo + 1 < hi) {
197 mid = (lo + hi) >> 1;
198 if ((mask >> mid) != 0)
208 * blist_create() - create a blist capable of handling up to the specified
211 * blocks - must be greater than 0
212 * flags - malloc flags
214 * The smallest blist consists of a single leaf node capable of
215 * managing BLIST_BMAP_RADIX blocks.
218 blist_create(daddr_t blocks, int flags)
221 daddr_t nodes, radix;
224 * Calculate the radix field used for scanning.
226 radix = BLIST_BMAP_RADIX;
227 while (radix < blocks) {
228 radix *= BLIST_META_RADIX;
230 nodes = 1 + blst_radix_init(NULL, radix, blocks);
232 bl = malloc(sizeof(struct blist), M_SWAP, flags);
236 bl->bl_blocks = blocks;
237 bl->bl_radix = radix;
239 bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
240 if (bl->bl_root == NULL) {
244 blst_radix_init(bl->bl_root, radix, blocks);
246 #if defined(BLIST_DEBUG)
248 "BLIST representing %lld blocks (%lld MB of swap)"
249 ", requiring %lldK of ram\n",
250 (long long)bl->bl_blocks,
251 (long long)bl->bl_blocks * 4 / 1024,
252 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
254 printf("BLIST raw radix tree contains %lld records\n",
262 blist_destroy(blist_t bl)
264 free(bl->bl_root, M_SWAP);
269 * blist_alloc() - reserve space in the block bitmap. Return the base
270 * of a contiguous region or SWAPBLK_NONE if space could
274 blist_alloc(blist_t bl, daddr_t count)
279 * This loop iterates at most twice. An allocation failure in the
280 * first iteration leads to a second iteration only if the cursor was
281 * non-zero. When the cursor is zero, an allocation failure will
282 * reduce the hint, stopping further iterations.
284 while (count <= bl->bl_root->bm_bighint) {
285 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
287 if (blk != SWAPBLK_NONE) {
288 bl->bl_cursor = blk + count;
289 if (bl->bl_cursor == bl->bl_blocks)
292 } else if (bl->bl_cursor != 0)
295 return (SWAPBLK_NONE);
299 * blist_avail() - return the number of free blocks.
302 blist_avail(blist_t bl)
305 if (bl->bl_radix == BLIST_BMAP_RADIX)
306 return (bitcount64(bl->bl_root->u.bmu_bitmap));
308 return (bl->bl_root->u.bmu_avail);
312 * blist_free() - free up space in the block bitmap. Return the base
313 * of a contiguous region. Panic if an inconsistency is
317 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
320 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
324 * blist_fill() - mark a region in the block bitmap as off-limits
325 * to the allocator (i.e. allocate it), ignoring any
326 * existing allocations. Return the number of blocks
327 * actually filled that were free before the call.
330 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
333 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
337 * blist_resize() - resize an existing radix tree to handle the
338 * specified number of blocks. This will reallocate
339 * the tree and transfer the previous bitmap to the new
340 * one. When extending the tree you can specify whether
341 * the new blocks are to left allocated or freed.
344 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
346 blist_t newbl = blist_create(count, flags);
350 if (count > save->bl_blocks)
351 count = save->bl_blocks;
352 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
355 * If resizing upwards, should we free the new space or not?
357 if (freenew && count < newbl->bl_blocks) {
358 blist_free(newbl, count, newbl->bl_blocks - count);
366 * blist_print() - dump radix tree
369 blist_print(blist_t bl)
371 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
372 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
378 static const u_daddr_t fib[] = {
379 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
380 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
381 514229, 832040, 1346269, 2178309, 3524578,
385 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
388 daddr_t start; /* current gap start, or SWAPBLK_NONE */
389 daddr_t num; /* number of gaps observed */
390 daddr_t max; /* largest gap size */
391 daddr_t avg; /* average gap size */
392 daddr_t err; /* sum - num * avg */
393 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
394 int max_bucket; /* last histo elt with nonzero val */
398 * gap_stats_counting() - is the state 'counting 1 bits'?
399 * or 'skipping 0 bits'?
402 gap_stats_counting(const struct gap_stats *stats)
405 return (stats->start != SWAPBLK_NONE);
409 * init_gap_stats() - initialize stats on gap sizes
412 init_gap_stats(struct gap_stats *stats)
415 bzero(stats, sizeof(*stats));
416 stats->start = SWAPBLK_NONE;
420 * update_gap_stats() - update stats on gap sizes
423 update_gap_stats(struct gap_stats *stats, daddr_t posn)
428 if (!gap_stats_counting(stats)) {
432 size = posn - stats->start;
433 stats->start = SWAPBLK_NONE;
434 if (size > stats->max)
438 * Find the fibonacci range that contains size,
439 * expecting to find it in an early range.
443 while (hi < nitems(fib) && fib[hi] <= size) {
447 if (hi >= nitems(fib))
449 while (lo + 1 != hi) {
450 mid = (lo + hi) >> 1;
451 if (fib[mid] <= size)
457 if (lo > stats->max_bucket)
458 stats->max_bucket = lo;
459 stats->err += size - stats->avg;
461 stats->avg += stats->err / stats->num;
462 stats->err %= stats->num;
466 * dump_gap_stats() - print stats on gap sizes
469 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
473 sbuf_printf(s, "number of maximal free ranges: %jd\n",
474 (intmax_t)stats->num);
475 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
476 sbuf_printf(s, "average maximal free range size: %jd\n",
477 (intmax_t)stats->avg);
478 sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
479 sbuf_printf(s, " count | size range\n");
480 sbuf_printf(s, " ----- | ----------\n");
481 for (i = 0; i < stats->max_bucket; i++) {
482 if (stats->histo[i] != 0) {
483 sbuf_printf(s, "%20jd | ",
484 (intmax_t)stats->histo[i]);
485 if (fib[i] != fib[i + 1] - 1)
486 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
487 (intmax_t)fib[i + 1] - 1);
489 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
492 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
493 if (stats->histo[i] > 1)
494 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
495 (intmax_t)stats->max);
497 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
501 * blist_stats() - dump radix tree stats
504 blist_stats(blist_t bl, struct sbuf *s)
506 struct gap_stats gstats;
507 struct gap_stats *stats = &gstats;
508 daddr_t i, nodes, radix;
509 u_daddr_t bit, diff, mask;
511 init_gap_stats(stats);
514 while (i < bl->bl_radix + bl->bl_blocks) {
516 * Find max size subtree starting at i.
518 radix = BLIST_BMAP_RADIX;
519 while (((i / radix) & BLIST_META_MASK) == 0)
520 radix *= BLIST_META_RADIX;
523 * Check for skippable subtrees starting at i.
525 while (radix > BLIST_BMAP_RADIX) {
526 if (bl->bl_root[nodes].u.bmu_avail == 0) {
527 if (gap_stats_counting(stats))
528 update_gap_stats(stats, i);
531 if (bl->bl_root[nodes].u.bmu_avail == radix) {
532 if (!gap_stats_counting(stats))
533 update_gap_stats(stats, i);
541 radix /= BLIST_META_RADIX;
543 if (radix == BLIST_BMAP_RADIX) {
547 mask = bl->bl_root[nodes].u.bmu_bitmap;
548 diff = mask ^ (mask << 1);
549 if (gap_stats_counting(stats))
553 update_gap_stats(stats, i + bitpos(bit));
557 nodes += radix_to_skip(radix);
560 update_gap_stats(stats, i);
561 dump_gap_stats(stats, s);
564 /************************************************************************
565 * ALLOCATION SUPPORT FUNCTIONS *
566 ************************************************************************
568 * These support functions do all the actual work. They may seem
569 * rather longish, but that's because I've commented them up. The
570 * actual code is straight forward.
575 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
577 * This is the core of the allocator and is optimized for the
578 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution
579 * time is proportional to log2(count) + bitpos time.
582 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
585 int count1, hi, lo, num_shifts, range1, range_ext;
589 num_shifts = fls(count1);
590 mask = scan->u.bmu_bitmap;
591 while ((-mask & ~mask) != 0 && num_shifts > 0) {
593 * If bit i is set in mask, then bits in [i, i+range1] are set
594 * in scan->u.bmu_bitmap. The value of range1 is equal to
595 * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
596 * while preserving these invariants. The updates to mask leave
597 * fewer bits set, but each bit that remains set represents a
598 * longer string of consecutive bits set in scan->u.bmu_bitmap.
599 * If more updates to mask cannot clear more bits, because mask
600 * is partitioned with all 0 bits preceding all 1 bits, the loop
601 * terminates immediately.
604 range_ext = range1 + ((count1 >> num_shifts) & 1);
606 * mask is a signed quantity for the shift because when it is
607 * shifted right, the sign bit should copied; when the last
608 * block of the leaf is free, pretend, for a while, that all the
609 * blocks that follow it are also free.
611 mask &= (daddr_t)mask >> range_ext;
616 * Update bighint. There is no allocation bigger than range1
617 * starting in this leaf.
619 scan->bm_bighint = range1;
620 return (SWAPBLK_NONE);
623 /* Discard any candidates that appear before blk. */
624 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
626 return (SWAPBLK_NONE);
629 * The least significant set bit in mask marks the start of the first
630 * available range of sufficient size. Clear all the bits but that one,
631 * and then find its position.
637 if (hi > BLIST_BMAP_RADIX) {
639 * An allocation within this leaf is impossible, so a successful
640 * allocation depends on the next leaf providing some of the blocks.
642 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
644 * The next leaf has a different meta-node parent, so it
645 * is not necessarily initialized. Update bighint,
646 * comparing the range found at the end of mask to the
647 * largest earlier range that could have been made to
648 * vanish in the initial processing of mask.
650 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
651 return (SWAPBLK_NONE);
653 hi -= BLIST_BMAP_RADIX;
654 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
656 * The next leaf doesn't have enough free blocks at the
657 * beginning to complete the spanning allocation. The
658 * hint cannot be updated, because the same allocation
659 * request could be satisfied later, by this leaf, if
660 * the state of the next leaf changes, and without any
661 * changes to this leaf.
663 return (SWAPBLK_NONE);
665 /* Clear the first 'hi' bits in the next leaf, allocating them. */
666 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
667 hi = BLIST_BMAP_RADIX;
670 /* Set the bits of mask at position 'lo' and higher. */
672 if (hi == BLIST_BMAP_RADIX) {
674 * Update bighint. There is no allocation bigger than range1
675 * available in this leaf after this allocation completes.
677 scan->bm_bighint = range1;
679 /* Clear the bits of mask at position 'hi' and higher. */
680 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
681 /* If this allocation uses all the bits, clear the hint. */
682 if (mask == scan->u.bmu_bitmap)
683 scan->bm_bighint = 0;
685 /* Clear the allocated bits from this leaf. */
686 scan->u.bmu_bitmap &= ~mask;
687 return ((blk & ~BLIST_BMAP_MASK) + lo);
691 * blist_meta_alloc() - allocate at a meta in the radix tree.
693 * Attempt to allocate at a meta node. If we can't, we update
694 * bighint and return a failure. Updating bighint optimizes future
695 * calls that hit this node. We have to check for our collapse cases
696 * and we have a few optimizations strewn in as well.
699 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
701 daddr_t blk, i, next_skip, r, skip;
703 bool scan_from_start;
705 if (radix == BLIST_BMAP_RADIX)
706 return (blst_leaf_alloc(scan, cursor, count));
707 if (scan->u.bmu_avail < count) {
709 * The meta node's hint must be too large if the allocation
710 * exceeds the number of free blocks. Reduce the hint, and
713 scan->bm_bighint = scan->u.bmu_avail;
714 return (SWAPBLK_NONE);
716 blk = cursor & -radix;
717 skip = radix_to_skip(radix);
718 next_skip = skip / BLIST_META_RADIX;
721 * An ALL-FREE meta node requires special handling before allocating
724 if (scan->u.bmu_avail == radix) {
725 radix /= BLIST_META_RADIX;
728 * Reinitialize each of the meta node's children. An ALL-FREE
729 * meta node cannot have a terminator in any subtree.
731 for (i = 1; i < skip; i += next_skip) {
733 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
735 scan[i].u.bmu_avail = radix;
736 scan[i].bm_bighint = radix;
739 radix /= BLIST_META_RADIX;
744 * The allocation exceeds the number of blocks that are
745 * managed by a subtree of this meta node.
747 panic("allocation too large");
749 scan_from_start = cursor == blk;
750 child = (cursor - blk) / radix;
751 blk += child * radix;
752 for (i = 1 + child * next_skip; i < skip; i += next_skip) {
753 if (count <= scan[i].bm_bighint) {
755 * The allocation might fit beginning in the i'th subtree.
757 r = blst_meta_alloc(&scan[i],
758 cursor > blk ? cursor : blk, count, radix);
759 if (r != SWAPBLK_NONE) {
760 scan->u.bmu_avail -= count;
763 } else if (scan[i].bm_bighint == (daddr_t)-1) {
773 * We couldn't allocate count in this subtree, update bighint.
775 if (scan_from_start && scan->bm_bighint >= count)
776 scan->bm_bighint = count - 1;
778 return (SWAPBLK_NONE);
782 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
786 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
792 * free some data in this bitmap
793 * mask=0000111111111110000
797 n = blk & BLIST_BMAP_MASK;
798 mask = ((u_daddr_t)-1 << n) &
799 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
800 if (scan->u.bmu_bitmap & mask)
801 panic("freeing free block");
802 scan->u.bmu_bitmap |= mask;
805 * We could probably do a better job here. We are required to make
806 * bighint at least as large as the biggest contiguous block of
807 * data. If we just shoehorn it, a little extra overhead will
808 * be incured on the next allocation (but only that one typically).
810 scan->bm_bighint = BLIST_BMAP_RADIX;
814 * BLST_META_FREE() - free allocated blocks from radix tree meta info
816 * This support routine frees a range of blocks from the bitmap.
817 * The range must be entirely enclosed by this radix node. If a
818 * meta node, we break the range down recursively to free blocks
819 * in subnodes (which means that this code can free an arbitrary
820 * range whereas the allocation code cannot allocate an arbitrary
824 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
826 daddr_t blk, i, next_skip, skip, v;
829 if (scan->bm_bighint == (daddr_t)-1)
830 panic("freeing invalid range");
831 if (radix == BLIST_BMAP_RADIX)
832 return (blst_leaf_free(scan, freeBlk, count));
833 skip = radix_to_skip(radix);
834 next_skip = skip / BLIST_META_RADIX;
836 if (scan->u.bmu_avail == 0) {
838 * ALL-ALLOCATED special case, with possible
839 * shortcut to ALL-FREE special case.
841 scan->u.bmu_avail = count;
842 scan->bm_bighint = count;
844 if (count != radix) {
845 for (i = 1; i < skip; i += next_skip) {
846 if (scan[i].bm_bighint == (daddr_t)-1)
848 scan[i].bm_bighint = 0;
849 if (next_skip == 1) {
850 scan[i].u.bmu_bitmap = 0;
852 scan[i].u.bmu_avail = 0;
858 scan->u.bmu_avail += count;
859 /* scan->bm_bighint = radix; */
863 * ALL-FREE special case.
866 if (scan->u.bmu_avail == radix)
868 if (scan->u.bmu_avail > radix)
869 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
870 (long long)count, (long long)scan->u.bmu_avail,
874 * Break the free down into its components
877 blk = freeBlk & -radix;
878 radix /= BLIST_META_RADIX;
880 child = (freeBlk - blk) / radix;
881 blk += child * radix;
882 i = 1 + child * next_skip;
883 while (i < skip && blk < freeBlk + count) {
884 v = blk + radix - freeBlk;
887 blst_meta_free(&scan[i], freeBlk, v, radix);
888 if (scan->bm_bighint < scan[i].bm_bighint)
889 scan->bm_bighint = scan[i].bm_bighint;
898 * BLIST_RADIX_COPY() - copy one radix tree to another
900 * Locates free space in the source tree and frees it in the destination
901 * tree. The space may not already be free in the destination.
904 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
907 daddr_t i, next_skip, skip;
913 if (radix == BLIST_BMAP_RADIX) {
914 u_daddr_t v = scan->u.bmu_bitmap;
916 if (v == (u_daddr_t)-1) {
917 blist_free(dest, blk, count);
921 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
922 if (v & ((u_daddr_t)1 << i))
923 blist_free(dest, blk + i, 1);
933 if (scan->u.bmu_avail == 0) {
935 * Source all allocated, leave dest allocated
939 if (scan->u.bmu_avail == radix) {
941 * Source all free, free entire dest
944 blist_free(dest, blk, count);
946 blist_free(dest, blk, radix);
951 skip = radix_to_skip(radix);
952 next_skip = skip / BLIST_META_RADIX;
953 radix /= BLIST_META_RADIX;
955 for (i = 1; count && i < skip; i += next_skip) {
956 if (scan[i].bm_bighint == (daddr_t)-1)
959 if (count >= radix) {
960 blst_copy(&scan[i], blk, radix, dest, radix);
964 blst_copy(&scan[i], blk, radix, dest, count);
973 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
975 * This routine allocates all blocks in the specified range
976 * regardless of any existing allocations in that range. Returns
977 * the number of blocks allocated by the call.
980 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
986 n = blk & BLIST_BMAP_MASK;
987 mask = ((u_daddr_t)-1 << n) &
988 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
990 /* Count the number of blocks that we are allocating. */
991 nblks = bitcount64(scan->u.bmu_bitmap & mask);
993 scan->u.bmu_bitmap &= ~mask;
998 * BLIST_META_FILL() - allocate specific blocks at a meta node
1000 * This routine allocates the specified range of blocks,
1001 * regardless of any existing allocations in the range. The
1002 * range must be within the extent of this node. Returns the
1003 * number of blocks allocated by the call.
1006 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1008 daddr_t blk, i, nblks, next_skip, skip, v;
1011 if (scan->bm_bighint == (daddr_t)-1)
1012 panic("filling invalid range");
1013 if (count > radix) {
1015 * The allocation exceeds the number of blocks that are
1016 * managed by this node.
1018 panic("fill too large");
1020 if (radix == BLIST_BMAP_RADIX)
1021 return (blst_leaf_fill(scan, allocBlk, count));
1022 if (count == radix || scan->u.bmu_avail == 0) {
1024 * ALL-ALLOCATED special case
1026 nblks = scan->u.bmu_avail;
1027 scan->u.bmu_avail = 0;
1028 scan->bm_bighint = 0;
1031 skip = radix_to_skip(radix);
1032 next_skip = skip / BLIST_META_RADIX;
1033 blk = allocBlk & -radix;
1036 * An ALL-FREE meta node requires special handling before allocating
1037 * any of its blocks.
1039 if (scan->u.bmu_avail == radix) {
1040 radix /= BLIST_META_RADIX;
1043 * Reinitialize each of the meta node's children. An ALL-FREE
1044 * meta node cannot have a terminator in any subtree.
1046 for (i = 1; i < skip; i += next_skip) {
1048 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
1050 scan[i].u.bmu_avail = radix;
1051 scan[i].bm_bighint = radix;
1054 radix /= BLIST_META_RADIX;
1058 child = (allocBlk - blk) / radix;
1059 blk += child * radix;
1060 i = 1 + child * next_skip;
1061 while (i < skip && blk < allocBlk + count) {
1062 v = blk + radix - allocBlk;
1065 nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
1071 scan->u.bmu_avail -= nblks;
1076 * BLST_RADIX_INIT() - initialize radix tree
1078 * Initialize our meta structures and bitmaps and calculate the exact
1079 * amount of space required to manage 'count' blocks - this space may
1080 * be considerably less than the calculated radix due to the large
1081 * RADIX values we use.
1084 blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
1086 daddr_t i, memindex, next_skip, skip;
1094 if (radix == BLIST_BMAP_RADIX) {
1096 scan->bm_bighint = 0;
1097 scan->u.bmu_bitmap = 0;
1103 * Meta node. If allocating the entire object we can special
1104 * case it. However, we need to figure out how much memory
1105 * is required to manage 'count' blocks, so we continue on anyway.
1109 scan->bm_bighint = 0;
1110 scan->u.bmu_avail = 0;
1113 skip = radix_to_skip(radix);
1114 next_skip = skip / BLIST_META_RADIX;
1115 radix /= BLIST_META_RADIX;
1117 for (i = 1; i < skip; i += next_skip) {
1118 if (count >= radix) {
1120 * Allocate the entire object
1123 blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1126 } else if (count > 0) {
1128 * Allocate a partial object
1131 blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1136 * Add terminator and break out. Make terminator bitmap
1137 * zero to avoid a spanning leaf allocation that
1138 * includes the terminator.
1141 scan[i].bm_bighint = (daddr_t)-1;
1142 scan[i].u.bmu_bitmap = 0;
1155 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1157 daddr_t i, next_skip, skip;
1159 if (radix == BLIST_BMAP_RADIX) {
1161 "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1163 (long long)blk, (long long)radix,
1164 (long long)scan->u.bmu_bitmap,
1165 (long long)scan->bm_bighint
1170 if (scan->u.bmu_avail == 0) {
1172 "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1179 if (scan->u.bmu_avail == radix) {
1181 "%*.*s(%08llx,%lld) ALL FREE\n",
1190 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1192 (long long)blk, (long long)radix,
1193 (long long)scan->u.bmu_avail,
1195 (long long)scan->bm_bighint
1198 skip = radix_to_skip(radix);
1199 next_skip = skip / BLIST_META_RADIX;
1200 radix /= BLIST_META_RADIX;
1203 for (i = 1; i < skip; i += next_skip) {
1204 if (scan[i].bm_bighint == (daddr_t)-1) {
1206 "%*.*s(%08llx,%lld): Terminator\n",
1208 (long long)blk, (long long)radix
1212 blst_radix_print(&scan[i], blk, radix, tab);
1228 main(int ac, char **av)
1235 for (i = 1; i < ac; ++i) {
1236 const char *ptr = av[i];
1238 size = strtol(ptr, NULL, 0);
1242 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1245 bl = blist_create(size, M_WAITOK);
1246 blist_free(bl, 0, size);
1251 long long count = 0;
1253 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1254 (long long)size, (long long)bl->bl_radix);
1256 if (fgets(buf, sizeof(buf), stdin) == NULL)
1260 if (sscanf(buf + 1, "%lld", &count) == 1) {
1261 blist_resize(&bl, count, 1, M_WAITOK);
1269 s = sbuf_new_auto();
1272 printf("%s", sbuf_data(s));
1276 if (sscanf(buf + 1, "%lld", &count) == 1) {
1277 daddr_t blk = blist_alloc(bl, count);
1278 printf(" R=%08llx\n", (long long)blk);
1284 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1285 blist_free(bl, da, count);
1291 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1293 (intmax_t)blist_fill(bl, da, count));
1319 panic(const char *ctl, ...)
1324 vfprintf(stderr, ctl, va);
1325 fprintf(stderr, "\n");