2 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions
6 * 1. Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * 2. Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * 3. Neither the name of the University nor the names of its contributors
12 * may be used to endorse or promote products derived from this software
13 * without specific prior written permission.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
30 * This module implements a general bitmap allocator/deallocator. The
31 * allocator eats around 2 bits per 'block'. The module does not
32 * try to interpret the meaning of a 'block' other than to return
33 * SWAPBLK_NONE on an allocation failure.
35 * A radix tree controls access to pieces of the bitmap, and includes
36 * auxiliary information at each interior node about the availability of
37 * contiguous free blocks in the subtree rooted at that node. Two radix
38 * constants are involved: one for the size of the bitmaps contained in the
39 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
40 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
41 * associated with a range of blocks. The root of any subtree stores a
42 * hint field that defines an upper bound on the size of the largest
43 * allocation that can begin in the associated block range. A hint is an
44 * upper bound on a potential allocation, but not necessarily a tight upper
47 * The radix tree also implements two collapsed states for meta nodes:
48 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
49 * in either of these two states, all information contained underneath
50 * the node is considered stale. These states are used to optimize
51 * allocation and freeing operations.
53 * The hinting greatly increases code efficiency for allocations while
54 * the general radix structure optimizes both allocations and frees. The
55 * radix tree should be able to operate well no matter how much
56 * fragmentation there is and no matter how large a bitmap is used.
58 * The blist code wires all necessary memory at creation time. Neither
59 * allocations nor frees require interaction with the memory subsystem.
60 * The non-blocking features of the blist code are used in the swap code
63 * LAYOUT: The radix tree is laid out recursively using a
64 * linear array. Each meta node is immediately followed (laid out
65 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
66 * is a recursive structure but one that can be easily scanned through
67 * a very simple 'skip' calculation. In order to support large radixes,
68 * portions of the tree may reside outside our memory allocation. We
69 * handle this with an early-termination optimization (when bighint is
70 * set to -1) on the scan. The memory allocation is only large enough
71 * to cover the number of blocks requested at creation time even if it
72 * must be encompassed in larger root-node radix.
74 * NOTE: the allocator cannot currently allocate more than
75 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
76 * large' if you try. This is an area that could use improvement. The
77 * radix is large enough that this restriction does not affect the swap
78 * system, though. Currently only the allocation code is affected by
79 * this algorithmic unfeature. The freeing code can handle arbitrary
82 * This code can be compiled stand-alone for debugging.
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD$");
90 #include <sys/param.h>
91 #include <sys/systm.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
98 #include <sys/mutex.h>
102 #ifndef BLIST_NO_DEBUG
106 #include <sys/types.h>
107 #include <sys/malloc.h>
108 #include <sys/sbuf.h>
116 #define bitcount64(x) __bitcount64((uint64_t)(x))
117 #define malloc(a,b,c) calloc(a, 1)
118 #define free(a,b) free(a)
119 static __inline int imax(int a, int b) { return (a > b ? a : b); }
121 #include <sys/blist.h>
123 void panic(const char *ctl, ...);
128 * static support functions
130 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
131 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
133 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
134 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
136 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
137 blist_t dest, daddr_t count);
138 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
139 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
142 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
147 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
150 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
151 "radix divisibility error");
152 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
153 #define BLIST_META_MASK (BLIST_META_RADIX - 1)
156 * For a subtree that can represent the state of up to 'radix' blocks, the
157 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
158 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
161 * in the 'meta' functions that process subtrees. Since integer division
162 * discards remainders, we can express this computation as
163 * skip = (m * m**h) / (m - 1)
164 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
165 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
166 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
167 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
168 * so that simple integer division by a constant can safely be used for the
171 static inline daddr_t
172 radix_to_skip(daddr_t radix)
176 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
181 * Assumes that the argument has only one bit set.
184 bitpos(u_daddr_t mask)
188 switch (sizeof(mask)) {
189 #ifdef HAVE_INLINE_FFSLL
190 case sizeof(long long):
191 return (ffsll(mask) - 1);
195 hi = BLIST_BMAP_RADIX;
196 while (lo + 1 < hi) {
197 mid = (lo + hi) >> 1;
198 if ((mask >> mid) != 0)
208 * blist_create() - create a blist capable of handling up to the specified
211 * blocks - must be greater than 0
212 * flags - malloc flags
214 * The smallest blist consists of a single leaf node capable of
215 * managing BLIST_BMAP_RADIX blocks.
218 blist_create(daddr_t blocks, int flags)
221 daddr_t i, last_block;
222 u_daddr_t nodes, radix, skip;
226 * Calculate the radix and node count used for scanning. Find the last
227 * block that is followed by a terminator.
229 last_block = blocks - 1;
230 radix = BLIST_BMAP_RADIX;
231 while (radix < blocks) {
232 if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
234 * A terminator will be added. Update last_block to the
235 * position just before that terminator.
237 last_block |= radix - 1;
238 radix *= BLIST_META_RADIX;
242 * Count the meta-nodes in the expanded tree, including the final
243 * terminator, from the bottom level up to the root.
245 nodes = (last_block >= blocks) ? 2 : 1;
246 last_block /= BLIST_BMAP_RADIX;
247 while (last_block > 0) {
248 nodes += last_block + 1;
249 last_block /= BLIST_META_RADIX;
251 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
256 bl->bl_blocks = blocks;
257 bl->bl_radix = radix;
261 * Initialize the empty tree by filling in root values, then initialize
262 * just the terminators in the rest of the tree.
264 bl->bl_root[0].bm_bighint = 0;
265 if (radix == BLIST_BMAP_RADIX)
266 bl->bl_root[0].u.bmu_bitmap = 0;
268 bl->bl_root[0].u.bmu_avail = 0;
269 last_block = blocks - 1;
271 while (radix > BLIST_BMAP_RADIX) {
272 radix /= BLIST_META_RADIX;
273 skip = radix_to_skip(radix);
274 digit = last_block / radix;
275 i += 1 + digit * skip;
276 if (digit != BLIST_META_MASK) {
280 bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
281 bl->bl_root[i + skip].u.bmu_bitmap = 0;
286 #if defined(BLIST_DEBUG)
288 "BLIST representing %lld blocks (%lld MB of swap)"
289 ", requiring %lldK of ram\n",
290 (long long)bl->bl_blocks,
291 (long long)bl->bl_blocks * 4 / 1024,
292 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
294 printf("BLIST raw radix tree contains %lld records\n",
302 blist_destroy(blist_t bl)
309 * blist_alloc() - reserve space in the block bitmap. Return the base
310 * of a contiguous region or SWAPBLK_NONE if space could
314 blist_alloc(blist_t bl, daddr_t count)
319 * This loop iterates at most twice. An allocation failure in the
320 * first iteration leads to a second iteration only if the cursor was
321 * non-zero. When the cursor is zero, an allocation failure will
322 * reduce the hint, stopping further iterations.
324 while (count <= bl->bl_root->bm_bighint) {
325 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
327 if (blk != SWAPBLK_NONE) {
328 bl->bl_cursor = blk + count;
329 if (bl->bl_cursor == bl->bl_blocks)
332 } else if (bl->bl_cursor != 0)
335 return (SWAPBLK_NONE);
339 * blist_avail() - return the number of free blocks.
342 blist_avail(blist_t bl)
345 if (bl->bl_radix == BLIST_BMAP_RADIX)
346 return (bitcount64(bl->bl_root->u.bmu_bitmap));
348 return (bl->bl_root->u.bmu_avail);
352 * blist_free() - free up space in the block bitmap. Return the base
353 * of a contiguous region. Panic if an inconsistency is
357 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
360 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
364 * blist_fill() - mark a region in the block bitmap as off-limits
365 * to the allocator (i.e. allocate it), ignoring any
366 * existing allocations. Return the number of blocks
367 * actually filled that were free before the call.
370 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
373 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
377 * blist_resize() - resize an existing radix tree to handle the
378 * specified number of blocks. This will reallocate
379 * the tree and transfer the previous bitmap to the new
380 * one. When extending the tree you can specify whether
381 * the new blocks are to be left allocated or freed.
384 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
386 blist_t newbl = blist_create(count, flags);
390 if (count > save->bl_blocks)
391 count = save->bl_blocks;
392 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
395 * If resizing upwards, should we free the new space or not?
397 if (freenew && count < newbl->bl_blocks) {
398 blist_free(newbl, count, newbl->bl_blocks - count);
406 * blist_print() - dump radix tree
409 blist_print(blist_t bl)
411 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
412 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
418 static const u_daddr_t fib[] = {
419 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
420 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
421 514229, 832040, 1346269, 2178309, 3524578,
425 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
428 daddr_t start; /* current gap start, or SWAPBLK_NONE */
429 daddr_t num; /* number of gaps observed */
430 daddr_t max; /* largest gap size */
431 daddr_t avg; /* average gap size */
432 daddr_t err; /* sum - num * avg */
433 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
434 int max_bucket; /* last histo elt with nonzero val */
438 * gap_stats_counting() - is the state 'counting 1 bits'?
439 * or 'skipping 0 bits'?
442 gap_stats_counting(const struct gap_stats *stats)
445 return (stats->start != SWAPBLK_NONE);
449 * init_gap_stats() - initialize stats on gap sizes
452 init_gap_stats(struct gap_stats *stats)
455 bzero(stats, sizeof(*stats));
456 stats->start = SWAPBLK_NONE;
460 * update_gap_stats() - update stats on gap sizes
463 update_gap_stats(struct gap_stats *stats, daddr_t posn)
468 if (!gap_stats_counting(stats)) {
472 size = posn - stats->start;
473 stats->start = SWAPBLK_NONE;
474 if (size > stats->max)
478 * Find the fibonacci range that contains size,
479 * expecting to find it in an early range.
483 while (hi < nitems(fib) && fib[hi] <= size) {
487 if (hi >= nitems(fib))
489 while (lo + 1 != hi) {
490 mid = (lo + hi) >> 1;
491 if (fib[mid] <= size)
497 if (lo > stats->max_bucket)
498 stats->max_bucket = lo;
499 stats->err += size - stats->avg;
501 stats->avg += stats->err / stats->num;
502 stats->err %= stats->num;
506 * dump_gap_stats() - print stats on gap sizes
509 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
513 sbuf_printf(s, "number of maximal free ranges: %jd\n",
514 (intmax_t)stats->num);
515 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
516 sbuf_printf(s, "average maximal free range size: %jd\n",
517 (intmax_t)stats->avg);
518 sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
519 sbuf_printf(s, " count | size range\n");
520 sbuf_printf(s, " ----- | ----------\n");
521 for (i = 0; i < stats->max_bucket; i++) {
522 if (stats->histo[i] != 0) {
523 sbuf_printf(s, "%20jd | ",
524 (intmax_t)stats->histo[i]);
525 if (fib[i] != fib[i + 1] - 1)
526 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
527 (intmax_t)fib[i + 1] - 1);
529 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
532 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
533 if (stats->histo[i] > 1)
534 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
535 (intmax_t)stats->max);
537 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
541 * blist_stats() - dump radix tree stats
544 blist_stats(blist_t bl, struct sbuf *s)
546 struct gap_stats gstats;
547 struct gap_stats *stats = &gstats;
548 daddr_t i, nodes, radix;
549 u_daddr_t bit, diff, mask;
551 init_gap_stats(stats);
554 while (i < bl->bl_radix + bl->bl_blocks) {
556 * Find max size subtree starting at i.
558 radix = BLIST_BMAP_RADIX;
559 while (((i / radix) & BLIST_META_MASK) == 0)
560 radix *= BLIST_META_RADIX;
563 * Check for skippable subtrees starting at i.
565 while (radix > BLIST_BMAP_RADIX) {
566 if (bl->bl_root[nodes].u.bmu_avail == 0) {
567 if (gap_stats_counting(stats))
568 update_gap_stats(stats, i);
571 if (bl->bl_root[nodes].u.bmu_avail == radix) {
572 if (!gap_stats_counting(stats))
573 update_gap_stats(stats, i);
581 radix /= BLIST_META_RADIX;
583 if (radix == BLIST_BMAP_RADIX) {
587 mask = bl->bl_root[nodes].u.bmu_bitmap;
588 diff = mask ^ (mask << 1);
589 if (gap_stats_counting(stats))
593 update_gap_stats(stats, i + bitpos(bit));
597 nodes += radix_to_skip(radix);
600 update_gap_stats(stats, i);
601 dump_gap_stats(stats, s);
604 /************************************************************************
605 * ALLOCATION SUPPORT FUNCTIONS *
606 ************************************************************************
608 * These support functions do all the actual work. They may seem
609 * rather longish, but that's because I've commented them up. The
610 * actual code is straight forward.
615 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
617 * This is the core of the allocator and is optimized for the
618 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution
619 * time is proportional to log2(count) + bitpos time.
622 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
625 int count1, hi, lo, num_shifts, range1, range_ext;
629 num_shifts = fls(count1);
630 mask = scan->u.bmu_bitmap;
631 while ((-mask & ~mask) != 0 && num_shifts > 0) {
633 * If bit i is set in mask, then bits in [i, i+range1] are set
634 * in scan->u.bmu_bitmap. The value of range1 is equal to
635 * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
636 * while preserving these invariants. The updates to mask leave
637 * fewer bits set, but each bit that remains set represents a
638 * longer string of consecutive bits set in scan->u.bmu_bitmap.
639 * If more updates to mask cannot clear more bits, because mask
640 * is partitioned with all 0 bits preceding all 1 bits, the loop
641 * terminates immediately.
644 range_ext = range1 + ((count1 >> num_shifts) & 1);
646 * mask is a signed quantity for the shift because when it is
647 * shifted right, the sign bit should copied; when the last
648 * block of the leaf is free, pretend, for a while, that all the
649 * blocks that follow it are also free.
651 mask &= (daddr_t)mask >> range_ext;
656 * Update bighint. There is no allocation bigger than range1
657 * starting in this leaf.
659 scan->bm_bighint = range1;
660 return (SWAPBLK_NONE);
663 /* Discard any candidates that appear before blk. */
664 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
666 return (SWAPBLK_NONE);
669 * The least significant set bit in mask marks the start of the first
670 * available range of sufficient size. Clear all the bits but that one,
671 * and then find its position.
677 if (hi > BLIST_BMAP_RADIX) {
679 * An allocation within this leaf is impossible, so a successful
680 * allocation depends on the next leaf providing some of the blocks.
682 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
684 * The next leaf has a different meta-node parent, so it
685 * is not necessarily initialized. Update bighint,
686 * comparing the range found at the end of mask to the
687 * largest earlier range that could have been made to
688 * vanish in the initial processing of mask.
690 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
691 return (SWAPBLK_NONE);
693 hi -= BLIST_BMAP_RADIX;
694 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
696 * The next leaf doesn't have enough free blocks at the
697 * beginning to complete the spanning allocation. The
698 * hint cannot be updated, because the same allocation
699 * request could be satisfied later, by this leaf, if
700 * the state of the next leaf changes, and without any
701 * changes to this leaf.
703 return (SWAPBLK_NONE);
705 /* Clear the first 'hi' bits in the next leaf, allocating them. */
706 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
707 hi = BLIST_BMAP_RADIX;
710 /* Set the bits of mask at position 'lo' and higher. */
712 if (hi == BLIST_BMAP_RADIX) {
714 * Update bighint. There is no allocation bigger than range1
715 * available in this leaf after this allocation completes.
717 scan->bm_bighint = range1;
719 /* Clear the bits of mask at position 'hi' and higher. */
720 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
721 /* If this allocation uses all the bits, clear the hint. */
722 if (mask == scan->u.bmu_bitmap)
723 scan->bm_bighint = 0;
725 /* Clear the allocated bits from this leaf. */
726 scan->u.bmu_bitmap &= ~mask;
727 return ((blk & ~BLIST_BMAP_MASK) + lo);
731 * blist_meta_alloc() - allocate at a meta in the radix tree.
733 * Attempt to allocate at a meta node. If we can't, we update
734 * bighint and return a failure. Updating bighint optimizes future
735 * calls that hit this node. We have to check for our collapse cases
736 * and we have a few optimizations strewn in as well.
739 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
741 daddr_t blk, i, next_skip, r, skip;
743 bool scan_from_start;
745 if (radix == BLIST_BMAP_RADIX)
746 return (blst_leaf_alloc(scan, cursor, count));
747 if (scan->u.bmu_avail < count) {
749 * The meta node's hint must be too large if the allocation
750 * exceeds the number of free blocks. Reduce the hint, and
753 scan->bm_bighint = scan->u.bmu_avail;
754 return (SWAPBLK_NONE);
756 blk = cursor & -radix;
757 skip = radix_to_skip(radix);
758 next_skip = skip / BLIST_META_RADIX;
761 * An ALL-FREE meta node requires special handling before allocating
764 if (scan->u.bmu_avail == radix) {
765 radix /= BLIST_META_RADIX;
768 * Reinitialize each of the meta node's children. An ALL-FREE
769 * meta node cannot have a terminator in any subtree.
771 for (i = 1; i < skip; i += next_skip) {
773 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
775 scan[i].u.bmu_avail = radix;
776 scan[i].bm_bighint = radix;
779 radix /= BLIST_META_RADIX;
784 * The allocation exceeds the number of blocks that are
785 * managed by a subtree of this meta node.
787 panic("allocation too large");
789 scan_from_start = cursor == blk;
790 child = (cursor - blk) / radix;
791 blk += child * radix;
792 for (i = 1 + child * next_skip; i < skip; i += next_skip) {
793 if (count <= scan[i].bm_bighint) {
795 * The allocation might fit beginning in the i'th subtree.
797 r = blst_meta_alloc(&scan[i],
798 cursor > blk ? cursor : blk, count, radix);
799 if (r != SWAPBLK_NONE) {
800 scan->u.bmu_avail -= count;
803 } else if (scan[i].bm_bighint == (daddr_t)-1) {
813 * We couldn't allocate count in this subtree, update bighint.
815 if (scan_from_start && scan->bm_bighint >= count)
816 scan->bm_bighint = count - 1;
818 return (SWAPBLK_NONE);
822 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
826 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
832 * free some data in this bitmap
833 * mask=0000111111111110000
837 n = blk & BLIST_BMAP_MASK;
838 mask = ((u_daddr_t)-1 << n) &
839 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
840 if (scan->u.bmu_bitmap & mask)
841 panic("freeing free block");
842 scan->u.bmu_bitmap |= mask;
845 * We could probably do a better job here. We are required to make
846 * bighint at least as large as the biggest contiguous block of
847 * data. If we just shoehorn it, a little extra overhead will
848 * be incured on the next allocation (but only that one typically).
850 scan->bm_bighint = BLIST_BMAP_RADIX;
854 * BLST_META_FREE() - free allocated blocks from radix tree meta info
856 * This support routine frees a range of blocks from the bitmap.
857 * The range must be entirely enclosed by this radix node. If a
858 * meta node, we break the range down recursively to free blocks
859 * in subnodes (which means that this code can free an arbitrary
860 * range whereas the allocation code cannot allocate an arbitrary
864 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
866 daddr_t blk, i, next_skip, skip, v;
869 if (scan->bm_bighint == (daddr_t)-1)
870 panic("freeing invalid range");
871 if (radix == BLIST_BMAP_RADIX)
872 return (blst_leaf_free(scan, freeBlk, count));
873 skip = radix_to_skip(radix);
874 next_skip = skip / BLIST_META_RADIX;
876 if (scan->u.bmu_avail == 0) {
878 * ALL-ALLOCATED special case, with possible
879 * shortcut to ALL-FREE special case.
881 scan->u.bmu_avail = count;
882 scan->bm_bighint = count;
884 if (count != radix) {
885 for (i = 1; i < skip; i += next_skip) {
886 if (scan[i].bm_bighint == (daddr_t)-1)
888 scan[i].bm_bighint = 0;
889 if (next_skip == 1) {
890 scan[i].u.bmu_bitmap = 0;
892 scan[i].u.bmu_avail = 0;
898 scan->u.bmu_avail += count;
899 /* scan->bm_bighint = radix; */
903 * ALL-FREE special case.
906 if (scan->u.bmu_avail == radix)
908 if (scan->u.bmu_avail > radix)
909 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
910 (long long)count, (long long)scan->u.bmu_avail,
914 * Break the free down into its components
917 blk = freeBlk & -radix;
918 radix /= BLIST_META_RADIX;
920 child = (freeBlk - blk) / radix;
921 blk += child * radix;
922 i = 1 + child * next_skip;
923 while (i < skip && blk < freeBlk + count) {
924 v = blk + radix - freeBlk;
927 blst_meta_free(&scan[i], freeBlk, v, radix);
928 if (scan->bm_bighint < scan[i].bm_bighint)
929 scan->bm_bighint = scan[i].bm_bighint;
938 * BLIST_RADIX_COPY() - copy one radix tree to another
940 * Locates free space in the source tree and frees it in the destination
941 * tree. The space may not already be free in the destination.
944 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
947 daddr_t i, next_skip, skip;
953 if (radix == BLIST_BMAP_RADIX) {
954 u_daddr_t v = scan->u.bmu_bitmap;
956 if (v == (u_daddr_t)-1) {
957 blist_free(dest, blk, count);
961 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
962 if (v & ((u_daddr_t)1 << i))
963 blist_free(dest, blk + i, 1);
973 if (scan->u.bmu_avail == 0) {
975 * Source all allocated, leave dest allocated
979 if (scan->u.bmu_avail == radix) {
981 * Source all free, free entire dest
984 blist_free(dest, blk, count);
986 blist_free(dest, blk, radix);
991 skip = radix_to_skip(radix);
992 next_skip = skip / BLIST_META_RADIX;
993 radix /= BLIST_META_RADIX;
995 for (i = 1; count && i < skip; i += next_skip) {
996 if (scan[i].bm_bighint == (daddr_t)-1)
999 if (count >= radix) {
1000 blst_copy(&scan[i], blk, radix, dest, radix);
1004 blst_copy(&scan[i], blk, radix, dest, count);
1013 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
1015 * This routine allocates all blocks in the specified range
1016 * regardless of any existing allocations in that range. Returns
1017 * the number of blocks allocated by the call.
1020 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
1026 n = blk & BLIST_BMAP_MASK;
1027 mask = ((u_daddr_t)-1 << n) &
1028 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
1030 /* Count the number of blocks that we are allocating. */
1031 nblks = bitcount64(scan->u.bmu_bitmap & mask);
1033 scan->u.bmu_bitmap &= ~mask;
1038 * BLIST_META_FILL() - allocate specific blocks at a meta node
1040 * This routine allocates the specified range of blocks,
1041 * regardless of any existing allocations in the range. The
1042 * range must be within the extent of this node. Returns the
1043 * number of blocks allocated by the call.
1046 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1048 daddr_t blk, i, nblks, next_skip, skip, v;
1051 if (scan->bm_bighint == (daddr_t)-1)
1052 panic("filling invalid range");
1053 if (count > radix) {
1055 * The allocation exceeds the number of blocks that are
1056 * managed by this node.
1058 panic("fill too large");
1060 if (radix == BLIST_BMAP_RADIX)
1061 return (blst_leaf_fill(scan, allocBlk, count));
1062 if (count == radix || scan->u.bmu_avail == 0) {
1064 * ALL-ALLOCATED special case
1066 nblks = scan->u.bmu_avail;
1067 scan->u.bmu_avail = 0;
1068 scan->bm_bighint = 0;
1071 skip = radix_to_skip(radix);
1072 next_skip = skip / BLIST_META_RADIX;
1073 blk = allocBlk & -radix;
1076 * An ALL-FREE meta node requires special handling before allocating
1077 * any of its blocks.
1079 if (scan->u.bmu_avail == radix) {
1080 radix /= BLIST_META_RADIX;
1083 * Reinitialize each of the meta node's children. An ALL-FREE
1084 * meta node cannot have a terminator in any subtree.
1086 for (i = 1; i < skip; i += next_skip) {
1088 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
1090 scan[i].u.bmu_avail = radix;
1091 scan[i].bm_bighint = radix;
1094 radix /= BLIST_META_RADIX;
1098 child = (allocBlk - blk) / radix;
1099 blk += child * radix;
1100 i = 1 + child * next_skip;
1101 while (i < skip && blk < allocBlk + count) {
1102 v = blk + radix - allocBlk;
1105 nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
1111 scan->u.bmu_avail -= nblks;
1118 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1120 daddr_t i, next_skip, skip;
1122 if (radix == BLIST_BMAP_RADIX) {
1124 "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1126 (long long)blk, (long long)radix,
1127 (long long)scan->u.bmu_bitmap,
1128 (long long)scan->bm_bighint
1133 if (scan->u.bmu_avail == 0) {
1135 "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1142 if (scan->u.bmu_avail == radix) {
1144 "%*.*s(%08llx,%lld) ALL FREE\n",
1153 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1155 (long long)blk, (long long)radix,
1156 (long long)scan->u.bmu_avail,
1158 (long long)scan->bm_bighint
1161 skip = radix_to_skip(radix);
1162 next_skip = skip / BLIST_META_RADIX;
1163 radix /= BLIST_META_RADIX;
1166 for (i = 1; i < skip; i += next_skip) {
1167 if (scan[i].bm_bighint == (daddr_t)-1) {
1169 "%*.*s(%08llx,%lld): Terminator\n",
1171 (long long)blk, (long long)radix
1175 blst_radix_print(&scan[i], blk, radix, tab);
1191 main(int ac, char **av)
1198 for (i = 1; i < ac; ++i) {
1199 const char *ptr = av[i];
1201 size = strtol(ptr, NULL, 0);
1205 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1208 bl = blist_create(size, M_WAITOK);
1209 blist_free(bl, 0, size);
1214 long long count = 0;
1216 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1217 (long long)size, (long long)bl->bl_radix);
1219 if (fgets(buf, sizeof(buf), stdin) == NULL)
1223 if (sscanf(buf + 1, "%lld", &count) == 1) {
1224 blist_resize(&bl, count, 1, M_WAITOK);
1232 s = sbuf_new_auto();
1235 printf("%s", sbuf_data(s));
1239 if (sscanf(buf + 1, "%lld", &count) == 1) {
1240 daddr_t blk = blist_alloc(bl, count);
1241 printf(" R=%08llx\n", (long long)blk);
1247 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1248 blist_free(bl, da, count);
1254 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1256 (intmax_t)blist_fill(bl, da, count));
1282 panic(const char *ctl, ...)
1287 vfprintf(stderr, ctl, va);
1288 fprintf(stderr, "\n");