sys/vm/uma_core.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
   5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   6  * Copyright (c) 2004-2006 Robert N. M. Watson
   7  * All rights reserved.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice unmodified, this list of conditions, and the following
  14  *    disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
  31 /*
  32  * uma_core.c  Implementation of the Universal Memory allocator
  33  *
  34  * This allocator is intended to replace the multitude of similar object caches
  35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  36  * efficient.  A primary design goal is to return unused memory to the rest of
  37  * the system.  This will make the system as a whole more flexible due to the
  38  * ability to move memory to subsystems which most need it instead of leaving
  39  * pools of reserved memory unused.
  40  *
  41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  42  * are well known.
  43  *
  44  */
  45
  46 /*
  47  * TODO:
  48  *      - Improve memory usage for large allocations
  49  *      - Investigate cache size adjustments
  50  */
  51
  52 #include <sys/cdefs.h>
  53 __FBSDID("$FreeBSD$");
  54
  55 #include "opt_ddb.h"
  56 #include "opt_param.h"
  57 #include "opt_vm.h"
  58
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/bitset.h>
  62 #include <sys/eventhandler.h>
  63 #include <sys/kernel.h>
  64 #include <sys/types.h>
  65 #include <sys/limits.h>
  66 #include <sys/queue.h>
  67 #include <sys/malloc.h>
  68 #include <sys/ktr.h>
  69 #include <sys/lock.h>
  70 #include <sys/sysctl.h>
  71 #include <sys/mutex.h>
  72 #include <sys/proc.h>
  73 #include <sys/random.h>
  74 #include <sys/rwlock.h>
  75 #include <sys/sbuf.h>
  76 #include <sys/sched.h>
  77 #include <sys/smp.h>
  78 #include <sys/taskqueue.h>
  79 #include <sys/vmmeter.h>
  80
  81 #include <vm/vm.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/vm_page.h>
  84 #include <vm/vm_pageout.h>
  85 #include <vm/vm_param.h>
  86 #include <vm/vm_phys.h>
  87 #include <vm/vm_map.h>
  88 #include <vm/vm_kern.h>
  89 #include <vm/vm_extern.h>
  90 #include <vm/uma.h>
  91 #include <vm/uma_int.h>
  92 #include <vm/uma_dbg.h>
  93
  94 #include <ddb/ddb.h>
  95
  96 #ifdef DEBUG_MEMGUARD
  97 #include <vm/memguard.h>
  98 #endif
  99
 100 /*
 101  * This is the zone and keg from which all zones are spawned.
 102  */
 103 static uma_zone_t kegs;
 104 static uma_zone_t zones;
 105
 106 /* This is the zone from which all offpage uma_slab_ts are allocated. */
 107 static uma_zone_t slabzone;
 108
 109 /*
 110  * The initial hash tables come out of this zone so they can be allocated
 111  * prior to malloc coming up.
 112  */
 113 static uma_zone_t hashzone;
 114
 115 /* The boot-time adjusted value for cache line alignment. */
 116 int uma_align_cache = 64 - 1;
 117
 118 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 119
 120 /*
 121  * Are we allowed to allocate buckets?
 122  */
 123 static int bucketdisable = 1;
 124
 125 /* Linked list of all kegs in the system */
 126 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 127
 128 /* Linked list of all cache-only zones in the system */
 129 static LIST_HEAD(,uma_zone) uma_cachezones =
 130     LIST_HEAD_INITIALIZER(uma_cachezones);
 131
 132 /* This RW lock protects the keg list */
 133 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 134
 135 /*
 136  * Pointer and counter to pool of pages, that is preallocated at
 137  * startup to bootstrap UMA.
 138  */
 139 static char *bootmem;
 140 static int boot_pages;
 141
 142 static struct sx uma_drain_lock;
 143
 144 /* kmem soft limit. */
 145 static unsigned long uma_kmem_limit = LONG_MAX;
 146 static volatile unsigned long uma_kmem_total;
 147
 148 /* Is the VM done starting up? */
 149 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
 150     BOOT_RUNNING } booted = BOOT_COLD;
 151
 152 /*
 153  * This is the handle used to schedule events that need to happen
 154  * outside of the allocation fast path.
 155  */
 156 static struct callout uma_callout;
 157 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 158
 159 /*
 160  * This structure is passed as the zone ctor arg so that I don't have to create
 161  * a special allocation function just for zones.
 162  */
 163 struct uma_zctor_args {
 164         const char *name;
 165         size_t size;
 166         uma_ctor ctor;
 167         uma_dtor dtor;
 168         uma_init uminit;
 169         uma_fini fini;
 170         uma_import import;
 171         uma_release release;
 172         void *arg;
 173         uma_keg_t keg;
 174         int align;
 175         uint32_t flags;
 176 };
 177
 178 struct uma_kctor_args {
 179         uma_zone_t zone;
 180         size_t size;
 181         uma_init uminit;
 182         uma_fini fini;
 183         int align;
 184         uint32_t flags;
 185 };
 186
 187 struct uma_bucket_zone {
 188         uma_zone_t      ubz_zone;
 189         char            *ubz_name;
 190         int             ubz_entries;    /* Number of items it can hold. */
 191         int             ubz_maxsize;    /* Maximum allocation size per-item. */
 192 };
 193
 194 /*
 195  * Compute the actual number of bucket entries to pack them in power
 196  * of two sizes for more efficient space utilization.
 197  */
 198 #define BUCKET_SIZE(n)                                          \
 199     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 200
 201 #define BUCKET_MAX      BUCKET_SIZE(256)
 202
 203 struct uma_bucket_zone bucket_zones[] = {
 204         { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
 205         { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
 206         { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
 207         { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
 208         { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
 209         { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
 210         { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
 211         { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
 212         { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 213         { NULL, NULL, 0}
 214 };
 215
 216 /*
 217  * Flags and enumerations to be passed to internal functions.
 218  */
 219 enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
 220
 221 #define UMA_ANYDOMAIN   -1      /* Special value for domain search. */
 222
 223 /* Prototypes.. */
 224
 225 int     uma_startup_count(int);
 226 void    uma_startup(void *, int);
 227 void    uma_startup1(void);
 228 void    uma_startup2(void);
 229
 230 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 231 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 232 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 233 static void page_free(void *, vm_size_t, uint8_t);
 234 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 235 static void cache_drain(uma_zone_t);
 236 static void bucket_drain(uma_zone_t, uma_bucket_t);
 237 static void bucket_cache_drain(uma_zone_t zone);
 238 static int keg_ctor(void *, int, void *, int);
 239 static void keg_dtor(void *, int, void *);
 240 static int zone_ctor(void *, int, void *, int);
 241 static void zone_dtor(void *, int, void *);
 242 static int zero_init(void *, int, int);
 243 static void keg_small_init(uma_keg_t keg);
 244 static void keg_large_init(uma_keg_t keg);
 245 static void zone_foreach(void (*zfunc)(uma_zone_t));
 246 static void zone_timeout(uma_zone_t zone);
 247 static int hash_alloc(struct uma_hash *);
 248 static int hash_expand(struct uma_hash *, struct uma_hash *);
 249 static void hash_free(struct uma_hash *hash);
 250 static void uma_timeout(void *);
 251 static void uma_startup3(void);
 252 static void *zone_alloc_item(uma_zone_t, void *, int, int);
 253 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 254 static void bucket_enable(void);
 255 static void bucket_init(void);
 256 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 257 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 258 static void bucket_zone_drain(void);
 259 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
 260 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
 261 static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
 262 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 263 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 264 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
 265     uma_fini fini, int align, uint32_t flags);
 266 static int zone_import(uma_zone_t, void **, int, int, int);
 267 static void zone_release(uma_zone_t, void **, int);
 268 static void uma_zero_item(void *, uma_zone_t);
 269
 270 void uma_print_zone(uma_zone_t);
 271 void uma_print_stats(void);
 272 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 273 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 274
 275 #ifdef INVARIANTS
 276 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 277 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 278 #endif
 279
 280 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 281
 282 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
 283     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 284
 285 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
 286     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 287
 288 static int zone_warnings = 1;
 289 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
 290     "Warn when UMA zones becomes full");
 291
 292 /* Adjust bytes under management by UMA. */
 293 static inline void
 294 uma_total_dec(unsigned long size)
 295 {
 296
 297         atomic_subtract_long(&uma_kmem_total, size);
 298 }
 299
 300 static inline void
 301 uma_total_inc(unsigned long size)
 302 {
 303
 304         if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
 305                 uma_reclaim_wakeup();
 306 }
 307
 308 /*
 309  * This routine checks to see whether or not it's safe to enable buckets.
 310  */
 311 static void
 312 bucket_enable(void)
 313 {
 314         bucketdisable = vm_page_count_min();
 315 }
 316
 317 /*
 318  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 319  *
 320  * For each zone, calculate the memory required for each bucket, consisting
 321  * of the header and an array of pointers.
 322  */
 323 static void
 324 bucket_init(void)
 325 {
 326         struct uma_bucket_zone *ubz;
 327         int size;
 328
 329         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 330                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 331                 size += sizeof(void *) * ubz->ubz_entries;
 332                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 333                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 334                     UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
 335         }
 336 }
 337
 338 /*
 339  * Given a desired number of entries for a bucket, return the zone from which
 340  * to allocate the bucket.
 341  */
 342 static struct uma_bucket_zone *
 343 bucket_zone_lookup(int entries)
 344 {
 345         struct uma_bucket_zone *ubz;
 346
 347         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 348                 if (ubz->ubz_entries >= entries)
 349                         return (ubz);
 350         ubz--;
 351         return (ubz);
 352 }
 353
 354 static int
 355 bucket_select(int size)
 356 {
 357         struct uma_bucket_zone *ubz;
 358
 359         ubz = &bucket_zones[0];
 360         if (size > ubz->ubz_maxsize)
 361                 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
 362
 363         for (; ubz->ubz_entries != 0; ubz++)
 364                 if (ubz->ubz_maxsize < size)
 365                         break;
 366         ubz--;
 367         return (ubz->ubz_entries);
 368 }
 369
 370 static uma_bucket_t
 371 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 372 {
 373         struct uma_bucket_zone *ubz;
 374         uma_bucket_t bucket;
 375
 376         /*
 377          * This is to stop us from allocating per cpu buckets while we're
 378          * running out of vm.boot_pages.  Otherwise, we would exhaust the
 379          * boot pages.  This also prevents us from allocating buckets in
 380          * low memory situations.
 381          */
 382         if (bucketdisable)
 383                 return (NULL);
 384         /*
 385          * To limit bucket recursion we store the original zone flags
 386          * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 387          * NOVM flag to persist even through deep recursions.  We also
 388          * store ZFLAG_BUCKET once we have recursed attempting to allocate
 389          * a bucket for a bucket zone so we do not allow infinite bucket
 390          * recursion.  This cookie will even persist to frees of unused
 391          * buckets via the allocation path or bucket allocations in the
 392          * free path.
 393          */
 394         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 395                 udata = (void *)(uintptr_t)zone->uz_flags;
 396         else {
 397                 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 398                         return (NULL);
 399                 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 400         }
 401         if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
 402                 flags |= M_NOVM;
 403         ubz = bucket_zone_lookup(zone->uz_count);
 404         if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 405                 ubz++;
 406         bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 407         if (bucket) {
 408 #ifdef INVARIANTS
 409                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 410 #endif
 411                 bucket->ub_cnt = 0;
 412                 bucket->ub_entries = ubz->ubz_entries;
 413         }
 414
 415         return (bucket);
 416 }
 417
 418 static void
 419 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 420 {
 421         struct uma_bucket_zone *ubz;
 422
 423         KASSERT(bucket->ub_cnt == 0,
 424             ("bucket_free: Freeing a non free bucket."));
 425         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 426                 udata = (void *)(uintptr_t)zone->uz_flags;
 427         ubz = bucket_zone_lookup(bucket->ub_entries);
 428         uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 429 }
 430
 431 static void
 432 bucket_zone_drain(void)
 433 {
 434         struct uma_bucket_zone *ubz;
 435
 436         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 437                 zone_drain(ubz->ubz_zone);
 438 }
 439
 440 static void
 441 zone_log_warning(uma_zone_t zone)
 442 {
 443         static const struct timeval warninterval = { 300, 0 };
 444
 445         if (!zone_warnings || zone->uz_warning == NULL)
 446                 return;
 447
 448         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 449                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 450 }
 451
 452 static inline void
 453 zone_maxaction(uma_zone_t zone)
 454 {
 455
 456         if (zone->uz_maxaction.ta_func != NULL)
 457                 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 458 }
 459
 460 static void
 461 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 462 {
 463         uma_klink_t klink;
 464
 465         LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 466                 kegfn(klink->kl_keg);
 467 }
 468
 469 /*
 470  * Routine called by timeout which is used to fire off some time interval
 471  * based calculations.  (stats, hash size, etc.)
 472  *
 473  * Arguments:
 474  *      arg   Unused
 475  *
 476  * Returns:
 477  *      Nothing
 478  */
 479 static void
 480 uma_timeout(void *unused)
 481 {
 482         bucket_enable();
 483         zone_foreach(zone_timeout);
 484
 485         /* Reschedule this event */
 486         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 487 }
 488
 489 /*
 490  * Routine to perform timeout driven calculations.  This expands the
 491  * hashes and does per cpu statistics aggregation.
 492  *
 493  *  Returns nothing.
 494  */
 495 static void
 496 keg_timeout(uma_keg_t keg)
 497 {
 498
 499         KEG_LOCK(keg);
 500         /*
 501          * Expand the keg hash table.
 502          *
 503          * This is done if the number of slabs is larger than the hash size.
 504          * What I'm trying to do here is completely reduce collisions.  This
 505          * may be a little aggressive.  Should I allow for two collisions max?
 506          */
 507         if (keg->uk_flags & UMA_ZONE_HASH &&
 508             keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 509                 struct uma_hash newhash;
 510                 struct uma_hash oldhash;
 511                 int ret;
 512
 513                 /*
 514                  * This is so involved because allocating and freeing
 515                  * while the keg lock is held will lead to deadlock.
 516                  * I have to do everything in stages and check for
 517                  * races.
 518                  */
 519                 newhash = keg->uk_hash;
 520                 KEG_UNLOCK(keg);
 521                 ret = hash_alloc(&newhash);
 522                 KEG_LOCK(keg);
 523                 if (ret) {
 524                         if (hash_expand(&keg->uk_hash, &newhash)) {
 525                                 oldhash = keg->uk_hash;
 526                                 keg->uk_hash = newhash;
 527                         } else
 528                                 oldhash = newhash;
 529
 530                         KEG_UNLOCK(keg);
 531                         hash_free(&oldhash);
 532                         return;
 533                 }
 534         }
 535         KEG_UNLOCK(keg);
 536 }
 537
 538 static void
 539 zone_timeout(uma_zone_t zone)
 540 {
 541
 542         zone_foreach_keg(zone, &keg_timeout);
 543 }
 544
 545 /*
 546  * Allocate and zero fill the next sized hash table from the appropriate
 547  * backing store.
 548  *
 549  * Arguments:
 550  *      hash  A new hash structure with the old hash size in uh_hashsize
 551  *
 552  * Returns:
 553  *      1 on success and 0 on failure.
 554  */
 555 static int
 556 hash_alloc(struct uma_hash *hash)
 557 {
 558         int oldsize;
 559         int alloc;
 560
 561         oldsize = hash->uh_hashsize;
 562
 563         /* We're just going to go to a power of two greater */
 564         if (oldsize)  {
 565                 hash->uh_hashsize = oldsize * 2;
 566                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 567                 hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 568                     M_UMAHASH, M_NOWAIT);
 569         } else {
 570                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 571                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 572                     UMA_ANYDOMAIN, M_WAITOK);
 573                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 574         }
 575         if (hash->uh_slab_hash) {
 576                 bzero(hash->uh_slab_hash, alloc);
 577                 hash->uh_hashmask = hash->uh_hashsize - 1;
 578                 return (1);
 579         }
 580
 581         return (0);
 582 }
 583
 584 /*
 585  * Expands the hash table for HASH zones.  This is done from zone_timeout
 586  * to reduce collisions.  This must not be done in the regular allocation
 587  * path, otherwise, we can recurse on the vm while allocating pages.
 588  *
 589  * Arguments:
 590  *      oldhash  The hash you want to expand
 591  *      newhash  The hash structure for the new table
 592  *
 593  * Returns:
 594  *      Nothing
 595  *
 596  * Discussion:
 597  */
 598 static int
 599 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 600 {
 601         uma_slab_t slab;
 602         int hval;
 603         int i;
 604
 605         if (!newhash->uh_slab_hash)
 606                 return (0);
 607
 608         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 609                 return (0);
 610
 611         /*
 612          * I need to investigate hash algorithms for resizing without a
 613          * full rehash.
 614          */
 615
 616         for (i = 0; i < oldhash->uh_hashsize; i++)
 617                 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 618                         slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 619                         SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 620                         hval = UMA_HASH(newhash, slab->us_data);
 621                         SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 622                             slab, us_hlink);
 623                 }
 624
 625         return (1);
 626 }
 627
 628 /*
 629  * Free the hash bucket to the appropriate backing store.
 630  *
 631  * Arguments:
 632  *      slab_hash  The hash bucket we're freeing
 633  *      hashsize   The number of entries in that hash bucket
 634  *
 635  * Returns:
 636  *      Nothing
 637  */
 638 static void
 639 hash_free(struct uma_hash *hash)
 640 {
 641         if (hash->uh_slab_hash == NULL)
 642                 return;
 643         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 644                 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 645         else
 646                 free(hash->uh_slab_hash, M_UMAHASH);
 647 }
 648
 649 /*
 650  * Frees all outstanding items in a bucket
 651  *
 652  * Arguments:
 653  *      zone   The zone to free to, must be unlocked.
 654  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 655  *
 656  * Returns:
 657  *      Nothing
 658  */
 659
 660 static void
 661 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 662 {
 663         int i;
 664
 665         if (bucket == NULL)
 666                 return;
 667
 668         if (zone->uz_fini)
 669                 for (i = 0; i < bucket->ub_cnt; i++)
 670                         zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 671         zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
 672         bucket->ub_cnt = 0;
 673 }
 674
 675 /*
 676  * Drains the per cpu caches for a zone.
 677  *
 678  * NOTE: This may only be called while the zone is being turn down, and not
 679  * during normal operation.  This is necessary in order that we do not have
 680  * to migrate CPUs to drain the per-CPU caches.
 681  *
 682  * Arguments:
 683  *      zone     The zone to drain, must be unlocked.
 684  *
 685  * Returns:
 686  *      Nothing
 687  */
 688 static void
 689 cache_drain(uma_zone_t zone)
 690 {
 691         uma_cache_t cache;
 692         int cpu;
 693
 694         /*
 695          * XXX: It is safe to not lock the per-CPU caches, because we're
 696          * tearing down the zone anyway.  I.e., there will be no further use
 697          * of the caches at this point.
 698          *
 699          * XXX: It would good to be able to assert that the zone is being
 700          * torn down to prevent improper use of cache_drain().
 701          *
 702          * XXX: We lock the zone before passing into bucket_cache_drain() as
 703          * it is used elsewhere.  Should the tear-down path be made special
 704          * there in some form?
 705          */
 706         CPU_FOREACH(cpu) {
 707                 cache = &zone->uz_cpu[cpu];
 708                 bucket_drain(zone, cache->uc_allocbucket);
 709                 bucket_drain(zone, cache->uc_freebucket);
 710                 if (cache->uc_allocbucket != NULL)
 711                         bucket_free(zone, cache->uc_allocbucket, NULL);
 712                 if (cache->uc_freebucket != NULL)
 713                         bucket_free(zone, cache->uc_freebucket, NULL);
 714                 cache->uc_allocbucket = cache->uc_freebucket = NULL;
 715         }
 716         ZONE_LOCK(zone);
 717         bucket_cache_drain(zone);
 718         ZONE_UNLOCK(zone);
 719 }
 720
 721 static void
 722 cache_shrink(uma_zone_t zone)
 723 {
 724
 725         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 726                 return;
 727
 728         ZONE_LOCK(zone);
 729         zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
 730         ZONE_UNLOCK(zone);
 731 }
 732
 733 static void
 734 cache_drain_safe_cpu(uma_zone_t zone)
 735 {
 736         uma_cache_t cache;
 737         uma_bucket_t b1, b2;
 738         int domain;
 739
 740         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 741                 return;
 742
 743         b1 = b2 = NULL;
 744         ZONE_LOCK(zone);
 745         critical_enter();
 746         if (zone->uz_flags & UMA_ZONE_NUMA)
 747                 domain = PCPU_GET(domain);
 748         else
 749                 domain = 0;
 750         cache = &zone->uz_cpu[curcpu];
 751         if (cache->uc_allocbucket) {
 752                 if (cache->uc_allocbucket->ub_cnt != 0)
 753                         LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 754                             cache->uc_allocbucket, ub_link);
 755                 else
 756                         b1 = cache->uc_allocbucket;
 757                 cache->uc_allocbucket = NULL;
 758         }
 759         if (cache->uc_freebucket) {
 760                 if (cache->uc_freebucket->ub_cnt != 0)
 761                         LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
 762                             cache->uc_freebucket, ub_link);
 763                 else
 764                         b2 = cache->uc_freebucket;
 765                 cache->uc_freebucket = NULL;
 766         }
 767         critical_exit();
 768         ZONE_UNLOCK(zone);
 769         if (b1)
 770                 bucket_free(zone, b1, NULL);
 771         if (b2)
 772                 bucket_free(zone, b2, NULL);
 773 }
 774
 775 /*
 776  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
 777  * This is an expensive call because it needs to bind to all CPUs
 778  * one by one and enter a critical section on each of them in order
 779  * to safely access their cache buckets.
 780  * Zone lock must not be held on call this function.
 781  */
 782 static void
 783 cache_drain_safe(uma_zone_t zone)
 784 {
 785         int cpu;
 786
 787         /*
 788          * Polite bucket sizes shrinking was not enouth, shrink aggressively.
 789          */
 790         if (zone)
 791                 cache_shrink(zone);
 792         else
 793                 zone_foreach(cache_shrink);
 794
 795         CPU_FOREACH(cpu) {
 796                 thread_lock(curthread);
 797                 sched_bind(curthread, cpu);
 798                 thread_unlock(curthread);
 799
 800                 if (zone)
 801                         cache_drain_safe_cpu(zone);
 802                 else
 803                         zone_foreach(cache_drain_safe_cpu);
 804         }
 805         thread_lock(curthread);
 806         sched_unbind(curthread);
 807         thread_unlock(curthread);
 808 }
 809
 810 /*
 811  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
 812  */
 813 static void
 814 bucket_cache_drain(uma_zone_t zone)
 815 {
 816         uma_zone_domain_t zdom;
 817         uma_bucket_t bucket;
 818         int i;
 819
 820         /*
 821          * Drain the bucket queues and free the buckets.
 822          */
 823         for (i = 0; i < vm_ndomains; i++) {
 824                 zdom = &zone->uz_domain[i];
 825                 while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
 826                         LIST_REMOVE(bucket, ub_link);
 827                         ZONE_UNLOCK(zone);
 828                         bucket_drain(zone, bucket);
 829                         bucket_free(zone, bucket, NULL);
 830                         ZONE_LOCK(zone);
 831                 }
 832         }
 833
 834         /*
 835          * Shrink further bucket sizes.  Price of single zone lock collision
 836          * is probably lower then price of global cache drain.
 837          */
 838         if (zone->uz_count > zone->uz_count_min)
 839                 zone->uz_count--;
 840 }
 841
 842 static void
 843 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 844 {
 845         uint8_t *mem;
 846         int i;
 847         uint8_t flags;
 848
 849         CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
 850             keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
 851
 852         mem = slab->us_data;
 853         flags = slab->us_flags;
 854         i = start;
 855         if (keg->uk_fini != NULL) {
 856                 for (i--; i > -1; i--)
 857                         keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
 858                             keg->uk_size);
 859         }
 860         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 861                 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 862         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 863         uma_total_dec(PAGE_SIZE * keg->uk_ppera);
 864 }
 865
 866 /*
 867  * Frees pages from a keg back to the system.  This is done on demand from
 868  * the pageout daemon.
 869  *
 870  * Returns nothing.
 871  */
 872 static void
 873 keg_drain(uma_keg_t keg)
 874 {
 875         struct slabhead freeslabs = { 0 };
 876         uma_domain_t dom;
 877         uma_slab_t slab, tmp;
 878         int i;
 879
 880         /*
 881          * We don't want to take pages from statically allocated kegs at this
 882          * time
 883          */
 884         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 885                 return;
 886
 887         CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
 888             keg->uk_name, keg, keg->uk_free);
 889         KEG_LOCK(keg);
 890         if (keg->uk_free == 0)
 891                 goto finished;
 892
 893         for (i = 0; i < vm_ndomains; i++) {
 894                 dom = &keg->uk_domain[i];
 895                 LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
 896                         /* We have nowhere to free these to. */
 897                         if (slab->us_flags & UMA_SLAB_BOOT)
 898                                 continue;
 899
 900                         LIST_REMOVE(slab, us_link);
 901                         keg->uk_pages -= keg->uk_ppera;
 902                         keg->uk_free -= keg->uk_ipers;
 903
 904                         if (keg->uk_flags & UMA_ZONE_HASH)
 905                                 UMA_HASH_REMOVE(&keg->uk_hash, slab,
 906                                     slab->us_data);
 907
 908                         SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 909                 }
 910         }
 911
 912 finished:
 913         KEG_UNLOCK(keg);
 914
 915         while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 916                 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 917                 keg_free_slab(keg, slab, keg->uk_ipers);
 918         }
 919 }
 920
 921 static void
 922 zone_drain_wait(uma_zone_t zone, int waitok)
 923 {
 924
 925         /*
 926          * Set draining to interlock with zone_dtor() so we can release our
 927          * locks as we go.  Only dtor() should do a WAITOK call since it
 928          * is the only call that knows the structure will still be available
 929          * when it wakes up.
 930          */
 931         ZONE_LOCK(zone);
 932         while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 933                 if (waitok == M_NOWAIT)
 934                         goto out;
 935                 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 936         }
 937         zone->uz_flags |= UMA_ZFLAG_DRAINING;
 938         bucket_cache_drain(zone);
 939         ZONE_UNLOCK(zone);
 940         /*
 941          * The DRAINING flag protects us from being freed while
 942          * we're running.  Normally the uma_rwlock would protect us but we
 943          * must be able to release and acquire the right lock for each keg.
 944          */
 945         zone_foreach_keg(zone, &keg_drain);
 946         ZONE_LOCK(zone);
 947         zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 948         wakeup(zone);
 949 out:
 950         ZONE_UNLOCK(zone);
 951 }
 952
 953 void
 954 zone_drain(uma_zone_t zone)
 955 {
 956
 957         zone_drain_wait(zone, M_NOWAIT);
 958 }
 959
 960 /*
 961  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
 962  *
 963  * Arguments:
 964  *      wait  Shall we wait?
 965  *
 966  * Returns:
 967  *      The slab that was allocated or NULL if there is no memory and the
 968  *      caller specified M_NOWAIT.
 969  */
 970 static uma_slab_t
 971 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
 972 {
 973         uma_alloc allocf;
 974         uma_slab_t slab;
 975         unsigned long size;
 976         uint8_t *mem;
 977         uint8_t flags;
 978         int i;
 979
 980         KASSERT(domain >= 0 && domain < vm_ndomains,
 981             ("keg_alloc_slab: domain %d out of range", domain));
 982         mtx_assert(&keg->uk_lock, MA_OWNED);
 983         slab = NULL;
 984         mem = NULL;
 985
 986         allocf = keg->uk_allocf;
 987         KEG_UNLOCK(keg);
 988         size = keg->uk_ppera * PAGE_SIZE;
 989
 990         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 991                 slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
 992                 if (slab == NULL)
 993                         goto out;
 994         }
 995
 996         /*
 997          * This reproduces the old vm_zone behavior of zero filling pages the
 998          * first time they are added to a zone.
 999          *
1000          * Malloced items are zeroed in uma_zalloc.
1001          */
1002
1003         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1004                 wait |= M_ZERO;
1005         else
1006                 wait &= ~M_ZERO;
1007
1008         if (keg->uk_flags & UMA_ZONE_NODUMP)
1009                 wait |= M_NODUMP;
1010
1011         /* zone is passed for legacy reasons. */
1012         mem = allocf(zone, size, domain, &flags, wait);
1013         if (mem == NULL) {
1014                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1015                         zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1016                 slab = NULL;
1017                 goto out;
1018         }
1019         uma_total_inc(size);
1020
1021         /* Point the slab into the allocated memory */
1022         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1023                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
1024
1025         if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1026                 for (i = 0; i < keg->uk_ppera; i++)
1027                         vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1028
1029         slab->us_keg = keg;
1030         slab->us_data = mem;
1031         slab->us_freecount = keg->uk_ipers;
1032         slab->us_flags = flags;
1033         slab->us_domain = domain;
1034         BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1035 #ifdef INVARIANTS
1036         BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1037 #endif
1038
1039         if (keg->uk_init != NULL) {
1040                 for (i = 0; i < keg->uk_ipers; i++)
1041                         if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1042                             keg->uk_size, wait) != 0)
1043                                 break;
1044                 if (i != keg->uk_ipers) {
1045                         keg_free_slab(keg, slab, i);
1046                         slab = NULL;
1047                         goto out;
1048                 }
1049         }
1050 out:
1051         KEG_LOCK(keg);
1052
1053         CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1054             slab, keg->uk_name, keg);
1055
1056         if (slab != NULL) {
1057                 if (keg->uk_flags & UMA_ZONE_HASH)
1058                         UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1059
1060                 keg->uk_pages += keg->uk_ppera;
1061                 keg->uk_free += keg->uk_ipers;
1062         }
1063
1064         return (slab);
1065 }
1066
1067 /*
1068  * This function is intended to be used early on in place of page_alloc() so
1069  * that we may use the boot time page cache to satisfy allocations before
1070  * the VM is ready.
1071  */
1072 static void *
1073 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1074     int wait)
1075 {
1076         uma_keg_t keg;
1077         void *mem;
1078         int pages;
1079
1080         keg = zone_first_keg(zone);
1081
1082         /*
1083          * If we are in BOOT_BUCKETS or higher, than switch to real
1084          * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1085          */
1086         switch (booted) {
1087                 case BOOT_COLD:
1088                 case BOOT_STRAPPED:
1089                         break;
1090                 case BOOT_PAGEALLOC:
1091                         if (keg->uk_ppera > 1)
1092                                 break;
1093                 case BOOT_BUCKETS:
1094                 case BOOT_RUNNING:
1095 #ifdef UMA_MD_SMALL_ALLOC
1096                         keg->uk_allocf = (keg->uk_ppera > 1) ?
1097                             page_alloc : uma_small_alloc;
1098 #else
1099                         keg->uk_allocf = page_alloc;
1100 #endif
1101                         return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1102         }
1103
1104         /*
1105          * Check our small startup cache to see if it has pages remaining.
1106          */
1107         pages = howmany(bytes, PAGE_SIZE);
1108         KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1109         if (pages > boot_pages)
1110                 panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1111 #ifdef DIAGNOSTIC
1112         printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1113             boot_pages);
1114 #endif
1115         mem = bootmem;
1116         boot_pages -= pages;
1117         bootmem += pages * PAGE_SIZE;
1118         *pflag = UMA_SLAB_BOOT;
1119
1120         return (mem);
1121 }
1122
1123 /*
1124  * Allocates a number of pages from the system
1125  *
1126  * Arguments:
1127  *      bytes  The number of bytes requested
1128  *      wait  Shall we wait?
1129  *
1130  * Returns:
1131  *      A pointer to the alloced memory or possibly
1132  *      NULL if M_NOWAIT is set.
1133  */
1134 static void *
1135 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1136     int wait)
1137 {
1138         void *p;        /* Returned page */
1139
1140         *pflag = UMA_SLAB_KERNEL;
1141         p = (void *) kmem_malloc_domain(domain, bytes, wait);
1142
1143         return (p);
1144 }
1145
1146 /*
1147  * Allocates a number of pages from within an object
1148  *
1149  * Arguments:
1150  *      bytes  The number of bytes requested
1151  *      wait   Shall we wait?
1152  *
1153  * Returns:
1154  *      A pointer to the alloced memory or possibly
1155  *      NULL if M_NOWAIT is set.
1156  */
1157 static void *
1158 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1159     int wait)
1160 {
1161         TAILQ_HEAD(, vm_page) alloctail;
1162         u_long npages;
1163         vm_offset_t retkva, zkva;
1164         vm_page_t p, p_next;
1165         uma_keg_t keg;
1166
1167         TAILQ_INIT(&alloctail);
1168         keg = zone_first_keg(zone);
1169
1170         npages = howmany(bytes, PAGE_SIZE);
1171         while (npages > 0) {
1172                 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1173                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1174                     ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1175                     VM_ALLOC_NOWAIT));
1176                 if (p != NULL) {
1177                         /*
1178                          * Since the page does not belong to an object, its
1179                          * listq is unused.
1180                          */
1181                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1182                         npages--;
1183                         continue;
1184                 }
1185                 /*
1186                  * Page allocation failed, free intermediate pages and
1187                  * exit.
1188                  */
1189                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1190                         vm_page_unwire(p, PQ_NONE);
1191                         vm_page_free(p);
1192                 }
1193                 return (NULL);
1194         }
1195         *flags = UMA_SLAB_PRIV;
1196         zkva = keg->uk_kva +
1197             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1198         retkva = zkva;
1199         TAILQ_FOREACH(p, &alloctail, listq) {
1200                 pmap_qenter(zkva, &p, 1);
1201                 zkva += PAGE_SIZE;
1202         }
1203
1204         return ((void *)retkva);
1205 }
1206
1207 /*
1208  * Frees a number of pages to the system
1209  *
1210  * Arguments:
1211  *      mem   A pointer to the memory to be freed
1212  *      size  The size of the memory being freed
1213  *      flags The original p->us_flags field
1214  *
1215  * Returns:
1216  *      Nothing
1217  */
1218 static void
1219 page_free(void *mem, vm_size_t size, uint8_t flags)
1220 {
1221         struct vmem *vmem;
1222
1223         if (flags & UMA_SLAB_KERNEL)
1224                 vmem = kernel_arena;
1225         else
1226                 panic("UMA: page_free used with invalid flags %x", flags);
1227
1228         kmem_free(vmem, (vm_offset_t)mem, size);
1229 }
1230
1231 /*
1232  * Zero fill initializer
1233  *
1234  * Arguments/Returns follow uma_init specifications
1235  */
1236 static int
1237 zero_init(void *mem, int size, int flags)
1238 {
1239         bzero(mem, size);
1240         return (0);
1241 }
1242
1243 /*
1244  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1245  *
1246  * Arguments
1247  *      keg  The zone we should initialize
1248  *
1249  * Returns
1250  *      Nothing
1251  */
1252 static void
1253 keg_small_init(uma_keg_t keg)
1254 {
1255         u_int rsize;
1256         u_int memused;
1257         u_int wastedspace;
1258         u_int shsize;
1259         u_int slabsize;
1260
1261         if (keg->uk_flags & UMA_ZONE_PCPU) {
1262                 u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1263
1264                 slabsize = sizeof(struct pcpu);
1265                 keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
1266                     PAGE_SIZE);
1267         } else {
1268                 slabsize = UMA_SLAB_SIZE;
1269                 keg->uk_ppera = 1;
1270         }
1271
1272         /*
1273          * Calculate the size of each allocation (rsize) according to
1274          * alignment.  If the requested size is smaller than we have
1275          * allocation bits for we round it up.
1276          */
1277         rsize = keg->uk_size;
1278         if (rsize < slabsize / SLAB_SETSIZE)
1279                 rsize = slabsize / SLAB_SETSIZE;
1280         if (rsize & keg->uk_align)
1281                 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1282         keg->uk_rsize = rsize;
1283
1284         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1285             keg->uk_rsize < sizeof(struct pcpu),
1286             ("%s: size %u too large", __func__, keg->uk_rsize));
1287
1288         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1289                 shsize = 0;
1290         else
1291                 shsize = sizeof(struct uma_slab);
1292
1293         keg->uk_ipers = (slabsize - shsize) / rsize;
1294         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1295             ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1296
1297         memused = keg->uk_ipers * rsize + shsize;
1298         wastedspace = slabsize - memused;
1299
1300         /*
1301          * We can't do OFFPAGE if we're internal or if we've been
1302          * asked to not go to the VM for buckets.  If we do this we
1303          * may end up going to the VM  for slabs which we do not
1304          * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1305          * of UMA_ZONE_VM, which clearly forbids it.
1306          */
1307         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1308             (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1309                 return;
1310
1311         /*
1312          * See if using an OFFPAGE slab will limit our waste.  Only do
1313          * this if it permits more items per-slab.
1314          *
1315          * XXX We could try growing slabsize to limit max waste as well.
1316          * Historically this was not done because the VM could not
1317          * efficiently handle contiguous allocations.
1318          */
1319         if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1320             (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1321                 keg->uk_ipers = slabsize / keg->uk_rsize;
1322                 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1323                     ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1324                 CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1325                     "keg: %s(%p), calculated wastedspace = %d, "
1326                     "maximum wasted space allowed = %d, "
1327                     "calculated ipers = %d, "
1328                     "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1329                     slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1330                     slabsize - keg->uk_ipers * keg->uk_rsize);
1331                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1332         }
1333
1334         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1335             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1336                 keg->uk_flags |= UMA_ZONE_HASH;
1337 }
1338
1339 /*
1340  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1341  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1342  * more complicated.
1343  *
1344  * Arguments
1345  *      keg  The keg we should initialize
1346  *
1347  * Returns
1348  *      Nothing
1349  */
1350 static void
1351 keg_large_init(uma_keg_t keg)
1352 {
1353         u_int shsize;
1354
1355         KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1356         KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1357             ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1358         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1359             ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1360
1361         keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1362         keg->uk_ipers = 1;
1363         keg->uk_rsize = keg->uk_size;
1364
1365         /* Check whether we have enough space to not do OFFPAGE. */
1366         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
1367                 shsize = sizeof(struct uma_slab);
1368                 if (shsize & UMA_ALIGN_PTR)
1369                         shsize = (shsize & ~UMA_ALIGN_PTR) +
1370                             (UMA_ALIGN_PTR + 1);
1371
1372                 if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
1373                         /*
1374                          * We can't do OFFPAGE if we're internal, in which case
1375                          * we need an extra page per allocation to contain the
1376                          * slab header.
1377                          */
1378                         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1379                                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1380                         else
1381                                 keg->uk_ppera++;
1382                 }
1383         }
1384
1385         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1386             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1387                 keg->uk_flags |= UMA_ZONE_HASH;
1388 }
1389
1390 static void
1391 keg_cachespread_init(uma_keg_t keg)
1392 {
1393         int alignsize;
1394         int trailer;
1395         int pages;
1396         int rsize;
1397
1398         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1399             ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1400
1401         alignsize = keg->uk_align + 1;
1402         rsize = keg->uk_size;
1403         /*
1404          * We want one item to start on every align boundary in a page.  To
1405          * do this we will span pages.  We will also extend the item by the
1406          * size of align if it is an even multiple of align.  Otherwise, it
1407          * would fall on the same boundary every time.
1408          */
1409         if (rsize & keg->uk_align)
1410                 rsize = (rsize & ~keg->uk_align) + alignsize;
1411         if ((rsize & alignsize) == 0)
1412                 rsize += alignsize;
1413         trailer = rsize - keg->uk_size;
1414         pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1415         pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1416         keg->uk_rsize = rsize;
1417         keg->uk_ppera = pages;
1418         keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1419         keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1420         KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1421             ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1422             keg->uk_ipers));
1423 }
1424
1425 /*
1426  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1427  * the keg onto the global keg list.
1428  *
1429  * Arguments/Returns follow uma_ctor specifications
1430  *      udata  Actually uma_kctor_args
1431  */
1432 static int
1433 keg_ctor(void *mem, int size, void *udata, int flags)
1434 {
1435         struct uma_kctor_args *arg = udata;
1436         uma_keg_t keg = mem;
1437         uma_zone_t zone;
1438
1439         bzero(keg, size);
1440         keg->uk_size = arg->size;
1441         keg->uk_init = arg->uminit;
1442         keg->uk_fini = arg->fini;
1443         keg->uk_align = arg->align;
1444         keg->uk_cursor = 0;
1445         keg->uk_free = 0;
1446         keg->uk_reserve = 0;
1447         keg->uk_pages = 0;
1448         keg->uk_flags = arg->flags;
1449         keg->uk_slabzone = NULL;
1450
1451         /*
1452          * The master zone is passed to us at keg-creation time.
1453          */
1454         zone = arg->zone;
1455         keg->uk_name = zone->uz_name;
1456
1457         if (arg->flags & UMA_ZONE_VM)
1458                 keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1459
1460         if (arg->flags & UMA_ZONE_ZINIT)
1461                 keg->uk_init = zero_init;
1462
1463         if (arg->flags & UMA_ZONE_MALLOC)
1464                 keg->uk_flags |= UMA_ZONE_VTOSLAB;
1465
1466         if (arg->flags & UMA_ZONE_PCPU)
1467 #ifdef SMP
1468                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1469 #else
1470                 keg->uk_flags &= ~UMA_ZONE_PCPU;
1471 #endif
1472
1473         if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1474                 keg_cachespread_init(keg);
1475         } else {
1476                 if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1477                         keg_large_init(keg);
1478                 else
1479                         keg_small_init(keg);
1480         }
1481
1482         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1483                 keg->uk_slabzone = slabzone;
1484
1485         /*
1486          * If we haven't booted yet we need allocations to go through the
1487          * startup cache until the vm is ready.
1488          */
1489         if (booted < BOOT_PAGEALLOC)
1490                 keg->uk_allocf = startup_alloc;
1491 #ifdef UMA_MD_SMALL_ALLOC
1492         else if (keg->uk_ppera == 1)
1493                 keg->uk_allocf = uma_small_alloc;
1494 #endif
1495         else
1496                 keg->uk_allocf = page_alloc;
1497 #ifdef UMA_MD_SMALL_ALLOC
1498         if (keg->uk_ppera == 1)
1499                 keg->uk_freef = uma_small_free;
1500         else
1501 #endif
1502                 keg->uk_freef = page_free;
1503
1504         /*
1505          * Initialize keg's lock
1506          */
1507         KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1508
1509         /*
1510          * If we're putting the slab header in the actual page we need to
1511          * figure out where in each page it goes.  This calculates a right
1512          * justified offset into the memory on an ALIGN_PTR boundary.
1513          */
1514         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1515                 u_int totsize;
1516
1517                 /* Size of the slab struct and free list */
1518                 totsize = sizeof(struct uma_slab);
1519
1520                 if (totsize & UMA_ALIGN_PTR)
1521                         totsize = (totsize & ~UMA_ALIGN_PTR) +
1522                             (UMA_ALIGN_PTR + 1);
1523                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1524
1525                 /*
1526                  * The only way the following is possible is if with our
1527                  * UMA_ALIGN_PTR adjustments we are now bigger than
1528                  * UMA_SLAB_SIZE.  I haven't checked whether this is
1529                  * mathematically possible for all cases, so we make
1530                  * sure here anyway.
1531                  */
1532                 totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1533                 if (totsize > PAGE_SIZE * keg->uk_ppera) {
1534                         printf("zone %s ipers %d rsize %d size %d\n",
1535                             zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1536                             keg->uk_size);
1537                         panic("UMA slab won't fit.");
1538                 }
1539         }
1540
1541         if (keg->uk_flags & UMA_ZONE_HASH)
1542                 hash_alloc(&keg->uk_hash);
1543
1544         CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1545             keg, zone->uz_name, zone,
1546             (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1547             keg->uk_free);
1548
1549         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1550
1551         rw_wlock(&uma_rwlock);
1552         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1553         rw_wunlock(&uma_rwlock);
1554         return (0);
1555 }
1556
1557 /*
1558  * Zone header ctor.  This initializes all fields, locks, etc.
1559  *
1560  * Arguments/Returns follow uma_ctor specifications
1561  *      udata  Actually uma_zctor_args
1562  */
1563 static int
1564 zone_ctor(void *mem, int size, void *udata, int flags)
1565 {
1566         struct uma_zctor_args *arg = udata;
1567         uma_zone_t zone = mem;
1568         uma_zone_t z;
1569         uma_keg_t keg;
1570
1571         bzero(zone, size);
1572         zone->uz_name = arg->name;
1573         zone->uz_ctor = arg->ctor;
1574         zone->uz_dtor = arg->dtor;
1575         zone->uz_slab = zone_fetch_slab;
1576         zone->uz_init = NULL;
1577         zone->uz_fini = NULL;
1578         zone->uz_allocs = 0;
1579         zone->uz_frees = 0;
1580         zone->uz_fails = 0;
1581         zone->uz_sleeps = 0;
1582         zone->uz_count = 0;
1583         zone->uz_count_min = 0;
1584         zone->uz_flags = 0;
1585         zone->uz_warning = NULL;
1586         /* The domain structures follow the cpu structures. */
1587         zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1588         timevalclear(&zone->uz_ratecheck);
1589         keg = arg->keg;
1590
1591         ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1592
1593         /*
1594          * This is a pure cache zone, no kegs.
1595          */
1596         if (arg->import) {
1597                 if (arg->flags & UMA_ZONE_VM)
1598                         arg->flags |= UMA_ZFLAG_CACHEONLY;
1599                 zone->uz_flags = arg->flags;
1600                 zone->uz_size = arg->size;
1601                 zone->uz_import = arg->import;
1602                 zone->uz_release = arg->release;
1603                 zone->uz_arg = arg->arg;
1604                 zone->uz_lockptr = &zone->uz_lock;
1605                 rw_wlock(&uma_rwlock);
1606                 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1607                 rw_wunlock(&uma_rwlock);
1608                 goto out;
1609         }
1610
1611         /*
1612          * Use the regular zone/keg/slab allocator.
1613          */
1614         zone->uz_import = (uma_import)zone_import;
1615         zone->uz_release = (uma_release)zone_release;
1616         zone->uz_arg = zone;
1617
1618         if (arg->flags & UMA_ZONE_SECONDARY) {
1619                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1620                 zone->uz_init = arg->uminit;
1621                 zone->uz_fini = arg->fini;
1622                 zone->uz_lockptr = &keg->uk_lock;
1623                 zone->uz_flags |= UMA_ZONE_SECONDARY;
1624                 rw_wlock(&uma_rwlock);
1625                 ZONE_LOCK(zone);
1626                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1627                         if (LIST_NEXT(z, uz_link) == NULL) {
1628                                 LIST_INSERT_AFTER(z, zone, uz_link);
1629                                 break;
1630                         }
1631                 }
1632                 ZONE_UNLOCK(zone);
1633                 rw_wunlock(&uma_rwlock);
1634         } else if (keg == NULL) {
1635                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1636                     arg->align, arg->flags)) == NULL)
1637                         return (ENOMEM);
1638         } else {
1639                 struct uma_kctor_args karg;
1640                 int error;
1641
1642                 /* We should only be here from uma_startup() */
1643                 karg.size = arg->size;
1644                 karg.uminit = arg->uminit;
1645                 karg.fini = arg->fini;
1646                 karg.align = arg->align;
1647                 karg.flags = arg->flags;
1648                 karg.zone = zone;
1649                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1650                     flags);
1651                 if (error)
1652                         return (error);
1653         }
1654
1655         /*
1656          * Link in the first keg.
1657          */
1658         zone->uz_klink.kl_keg = keg;
1659         LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1660         zone->uz_lockptr = &keg->uk_lock;
1661         zone->uz_size = keg->uk_size;
1662         zone->uz_flags |= (keg->uk_flags &
1663             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1664
1665         /*
1666          * Some internal zones don't have room allocated for the per cpu
1667          * caches.  If we're internal, bail out here.
1668          */
1669         if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1670                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1671                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1672                 return (0);
1673         }
1674
1675 out:
1676         if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
1677                 zone->uz_count = bucket_select(zone->uz_size);
1678         else
1679                 zone->uz_count = BUCKET_MAX;
1680         zone->uz_count_min = zone->uz_count;
1681
1682         return (0);
1683 }
1684
1685 /*
1686  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1687  * table and removes the keg from the global list.
1688  *
1689  * Arguments/Returns follow uma_dtor specifications
1690  *      udata  unused
1691  */
1692 static void
1693 keg_dtor(void *arg, int size, void *udata)
1694 {
1695         uma_keg_t keg;
1696
1697         keg = (uma_keg_t)arg;
1698         KEG_LOCK(keg);
1699         if (keg->uk_free != 0) {
1700                 printf("Freed UMA keg (%s) was not empty (%d items). "
1701                     " Lost %d pages of memory.\n",
1702                     keg->uk_name ? keg->uk_name : "",
1703                     keg->uk_free, keg->uk_pages);
1704         }
1705         KEG_UNLOCK(keg);
1706
1707         hash_free(&keg->uk_hash);
1708
1709         KEG_LOCK_FINI(keg);
1710 }
1711
1712 /*
1713  * Zone header dtor.
1714  *
1715  * Arguments/Returns follow uma_dtor specifications
1716  *      udata  unused
1717  */
1718 static void
1719 zone_dtor(void *arg, int size, void *udata)
1720 {
1721         uma_klink_t klink;
1722         uma_zone_t zone;
1723         uma_keg_t keg;
1724
1725         zone = (uma_zone_t)arg;
1726         keg = zone_first_keg(zone);
1727
1728         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1729                 cache_drain(zone);
1730
1731         rw_wlock(&uma_rwlock);
1732         LIST_REMOVE(zone, uz_link);
1733         rw_wunlock(&uma_rwlock);
1734         /*
1735          * XXX there are some races here where
1736          * the zone can be drained but zone lock
1737          * released and then refilled before we
1738          * remove it... we dont care for now
1739          */
1740         zone_drain_wait(zone, M_WAITOK);
1741         /*
1742          * Unlink all of our kegs.
1743          */
1744         while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1745                 klink->kl_keg = NULL;
1746                 LIST_REMOVE(klink, kl_link);
1747                 if (klink == &zone->uz_klink)
1748                         continue;
1749                 free(klink, M_TEMP);
1750         }
1751         /*
1752          * We only destroy kegs from non secondary zones.
1753          */
1754         if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1755                 rw_wlock(&uma_rwlock);
1756                 LIST_REMOVE(keg, uk_link);
1757                 rw_wunlock(&uma_rwlock);
1758                 zone_free_item(kegs, keg, NULL, SKIP_NONE);
1759         }
1760         ZONE_LOCK_FINI(zone);
1761 }
1762
1763 /*
1764  * Traverses every zone in the system and calls a callback
1765  *
1766  * Arguments:
1767  *      zfunc  A pointer to a function which accepts a zone
1768  *              as an argument.
1769  *
1770  * Returns:
1771  *      Nothing
1772  */
1773 static void
1774 zone_foreach(void (*zfunc)(uma_zone_t))
1775 {
1776         uma_keg_t keg;
1777         uma_zone_t zone;
1778
1779         rw_rlock(&uma_rwlock);
1780         LIST_FOREACH(keg, &uma_kegs, uk_link) {
1781                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1782                         zfunc(zone);
1783         }
1784         rw_runlock(&uma_rwlock);
1785 }
1786
1787 /*
1788  * Count how many pages do we need to bootstrap.  VM supplies
1789  * its need in early zones in the argument, we add up our zones,
1790  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The
1791  * zone of zones and zone of kegs are accounted separately.
1792  */
1793 #define UMA_BOOT_ZONES  11
1794 /* Zone of zones and zone of kegs have arbitrary alignment. */
1795 #define UMA_BOOT_ALIGN  32
1796 static int zsize, ksize;
1797 int
1798 uma_startup_count(int vm_zones)
1799 {
1800         int zones, pages;
1801
1802         ksize = sizeof(struct uma_keg) +
1803             (sizeof(struct uma_domain) * vm_ndomains);
1804         zsize = sizeof(struct uma_zone) +
1805             (sizeof(struct uma_cache) * (mp_maxid + 1)) +
1806             (sizeof(struct uma_zone_domain) * vm_ndomains);
1807
1808         /*
1809          * Memory for the zone of kegs and its keg,
1810          * and for zone of zones.
1811          */
1812         pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
1813             roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
1814
1815 #ifdef  UMA_MD_SMALL_ALLOC
1816         zones = UMA_BOOT_ZONES;
1817 #else
1818         zones = UMA_BOOT_ZONES + vm_zones;
1819         vm_zones = 0;
1820 #endif
1821
1822         /* Memory for the rest of startup zones, UMA and VM, ... */
1823         if (zsize > UMA_SLAB_SIZE)
1824                 pages += (zones + vm_zones) *
1825                     howmany(roundup2(zsize, UMA_BOOT_ALIGN), UMA_SLAB_SIZE);
1826         else
1827                 pages += howmany(zones,
1828                     UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
1829
1830         /* ... and their kegs. Note that zone of zones allocates a keg! */
1831         pages += howmany(zones + 1,
1832             UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
1833
1834         /*
1835          * Most of startup zones are not going to be offpages, that's
1836          * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
1837          * calculations.  Some large bucket zones will be offpage, and
1838          * thus will allocate hashes.  We take conservative approach
1839          * and assume that all zones may allocate hash.  This may give
1840          * us some positive inaccuracy, usually an extra single page.
1841          */
1842         pages += howmany(zones, UMA_SLAB_SPACE /
1843             (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
1844
1845         return (pages);
1846 }
1847
1848 void
1849 uma_startup(void *mem, int npages)
1850 {
1851         struct uma_zctor_args args;
1852         uma_keg_t masterkeg;
1853         uintptr_t m;
1854
1855 #ifdef DIAGNOSTIC
1856         printf("Entering %s with %d boot pages configured\n", __func__, npages);
1857 #endif
1858
1859         rw_init(&uma_rwlock, "UMA lock");
1860
1861         /* Use bootpages memory for the zone of zones and zone of kegs. */
1862         m = (uintptr_t)mem;
1863         zones = (uma_zone_t)m;
1864         m += roundup(zsize, CACHE_LINE_SIZE);
1865         kegs = (uma_zone_t)m;
1866         m += roundup(zsize, CACHE_LINE_SIZE);
1867         masterkeg = (uma_keg_t)m;
1868         m += roundup(ksize, CACHE_LINE_SIZE);
1869         m = roundup(m, PAGE_SIZE);
1870         npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
1871         mem = (void *)m;
1872
1873         /* "manually" create the initial zone */
1874         memset(&args, 0, sizeof(args));
1875         args.name = "UMA Kegs";
1876         args.size = ksize;
1877         args.ctor = keg_ctor;
1878         args.dtor = keg_dtor;
1879         args.uminit = zero_init;
1880         args.fini = NULL;
1881         args.keg = masterkeg;
1882         args.align = UMA_BOOT_ALIGN - 1;
1883         args.flags = UMA_ZFLAG_INTERNAL;
1884         zone_ctor(kegs, zsize, &args, M_WAITOK);
1885
1886         bootmem = mem;
1887         boot_pages = npages;
1888
1889         args.name = "UMA Zones";
1890         args.size = zsize;
1891         args.ctor = zone_ctor;
1892         args.dtor = zone_dtor;
1893         args.uminit = zero_init;
1894         args.fini = NULL;
1895         args.keg = NULL;
1896         args.align = UMA_BOOT_ALIGN - 1;
1897         args.flags = UMA_ZFLAG_INTERNAL;
1898         zone_ctor(zones, zsize, &args, M_WAITOK);
1899
1900         /* Now make a zone for slab headers */
1901         slabzone = uma_zcreate("UMA Slabs",
1902                                 sizeof(struct uma_slab),
1903                                 NULL, NULL, NULL, NULL,
1904                                 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1905
1906         hashzone = uma_zcreate("UMA Hash",
1907             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1908             NULL, NULL, NULL, NULL,
1909             UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1910
1911         bucket_init();
1912
1913         booted = BOOT_STRAPPED;
1914 }
1915
1916 void
1917 uma_startup1(void)
1918 {
1919
1920 #ifdef DIAGNOSTIC
1921         printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
1922 #endif
1923         booted = BOOT_PAGEALLOC;
1924 }
1925
1926 void
1927 uma_startup2(void)
1928 {
1929
1930 #ifdef DIAGNOSTIC
1931         printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
1932 #endif
1933         booted = BOOT_BUCKETS;
1934         sx_init(&uma_drain_lock, "umadrain");
1935         bucket_enable();
1936 }
1937
1938 /*
1939  * Initialize our callout handle
1940  *
1941  */
1942 static void
1943 uma_startup3(void)
1944 {
1945
1946         booted = BOOT_RUNNING;
1947         callout_init(&uma_callout, 1);
1948         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1949 }
1950
1951 static uma_keg_t
1952 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1953                 int align, uint32_t flags)
1954 {
1955         struct uma_kctor_args args;
1956
1957         args.size = size;
1958         args.uminit = uminit;
1959         args.fini = fini;
1960         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1961         args.flags = flags;
1962         args.zone = zone;
1963         return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
1964 }
1965
1966 /* Public functions */
1967 /* See uma.h */
1968 void
1969 uma_set_align(int align)
1970 {
1971
1972         if (align != UMA_ALIGN_CACHE)
1973                 uma_align_cache = align;
1974 }
1975
1976 /* See uma.h */
1977 uma_zone_t
1978 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1979                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
1980
1981 {
1982         struct uma_zctor_args args;
1983         uma_zone_t res;
1984         bool locked;
1985
1986         KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
1987             align, name));
1988
1989         /* This stuff is essential for the zone ctor */
1990         memset(&args, 0, sizeof(args));
1991         args.name = name;
1992         args.size = size;
1993         args.ctor = ctor;
1994         args.dtor = dtor;
1995         args.uminit = uminit;
1996         args.fini = fini;
1997 #ifdef  INVARIANTS
1998         /*
1999          * If a zone is being created with an empty constructor and
2000          * destructor, pass UMA constructor/destructor which checks for
2001          * memory use after free.
2002          */
2003         if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2004             ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2005                 args.ctor = trash_ctor;
2006                 args.dtor = trash_dtor;
2007                 args.uminit = trash_init;
2008                 args.fini = trash_fini;
2009         }
2010 #endif
2011         args.align = align;
2012         args.flags = flags;
2013         args.keg = NULL;
2014
2015         if (booted < BOOT_BUCKETS) {
2016                 locked = false;
2017         } else {
2018                 sx_slock(&uma_drain_lock);
2019                 locked = true;
2020         }
2021         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2022         if (locked)
2023                 sx_sunlock(&uma_drain_lock);
2024         return (res);
2025 }
2026
2027 /* See uma.h */
2028 uma_zone_t
2029 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2030                     uma_init zinit, uma_fini zfini, uma_zone_t master)
2031 {
2032         struct uma_zctor_args args;
2033         uma_keg_t keg;
2034         uma_zone_t res;
2035         bool locked;
2036
2037         keg = zone_first_keg(master);
2038         memset(&args, 0, sizeof(args));
2039         args.name = name;
2040         args.size = keg->uk_size;
2041         args.ctor = ctor;
2042         args.dtor = dtor;
2043         args.uminit = zinit;
2044         args.fini = zfini;
2045         args.align = keg->uk_align;
2046         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2047         args.keg = keg;
2048
2049         if (booted < BOOT_BUCKETS) {
2050                 locked = false;
2051         } else {
2052                 sx_slock(&uma_drain_lock);
2053                 locked = true;
2054         }
2055         /* XXX Attaches only one keg of potentially many. */
2056         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2057         if (locked)
2058                 sx_sunlock(&uma_drain_lock);
2059         return (res);
2060 }
2061
2062 /* See uma.h */
2063 uma_zone_t
2064 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2065                     uma_init zinit, uma_fini zfini, uma_import zimport,
2066                     uma_release zrelease, void *arg, int flags)
2067 {
2068         struct uma_zctor_args args;
2069
2070         memset(&args, 0, sizeof(args));
2071         args.name = name;
2072         args.size = size;
2073         args.ctor = ctor;
2074         args.dtor = dtor;
2075         args.uminit = zinit;
2076         args.fini = zfini;
2077         args.import = zimport;
2078         args.release = zrelease;
2079         args.arg = arg;
2080         args.align = 0;
2081         args.flags = flags;
2082
2083         return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2084 }
2085
2086 static void
2087 zone_lock_pair(uma_zone_t a, uma_zone_t b)
2088 {
2089         if (a < b) {
2090                 ZONE_LOCK(a);
2091                 mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
2092         } else {
2093                 ZONE_LOCK(b);
2094                 mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
2095         }
2096 }
2097
2098 static void
2099 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2100 {
2101
2102         ZONE_UNLOCK(a);
2103         ZONE_UNLOCK(b);
2104 }
2105
2106 int
2107 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2108 {
2109         uma_klink_t klink;
2110         uma_klink_t kl;
2111         int error;
2112
2113         error = 0;
2114         klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2115
2116         zone_lock_pair(zone, master);
2117         /*
2118          * zone must use vtoslab() to resolve objects and must already be
2119          * a secondary.
2120          */
2121         if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2122             != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2123                 error = EINVAL;
2124                 goto out;
2125         }
2126         /*
2127          * The new master must also use vtoslab().
2128          */
2129         if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2130                 error = EINVAL;
2131                 goto out;
2132         }
2133
2134         /*
2135          * The underlying object must be the same size.  rsize
2136          * may be different.
2137          */
2138         if (master->uz_size != zone->uz_size) {
2139                 error = E2BIG;
2140                 goto out;
2141         }
2142         /*
2143          * Put it at the end of the list.
2144          */
2145         klink->kl_keg = zone_first_keg(master);
2146         LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2147                 if (LIST_NEXT(kl, kl_link) == NULL) {
2148                         LIST_INSERT_AFTER(kl, klink, kl_link);
2149                         break;
2150                 }
2151         }
2152         klink = NULL;
2153         zone->uz_flags |= UMA_ZFLAG_MULTI;
2154         zone->uz_slab = zone_fetch_slab_multi;
2155
2156 out:
2157         zone_unlock_pair(zone, master);
2158         if (klink != NULL)
2159                 free(klink, M_TEMP);
2160
2161         return (error);
2162 }
2163
2164
2165 /* See uma.h */
2166 void
2167 uma_zdestroy(uma_zone_t zone)
2168 {
2169
2170         sx_slock(&uma_drain_lock);
2171         zone_free_item(zones, zone, NULL, SKIP_NONE);
2172         sx_sunlock(&uma_drain_lock);
2173 }
2174
2175 void
2176 uma_zwait(uma_zone_t zone)
2177 {
2178         void *item;
2179
2180         item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2181         uma_zfree(zone, item);
2182 }
2183
2184 /* See uma.h */
2185 void *
2186 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2187 {
2188         uma_zone_domain_t zdom;
2189         uma_bucket_t bucket;
2190         uma_cache_t cache;
2191         void *item;
2192         int cpu, domain, lockfail;
2193
2194         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2195         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2196
2197         /* This is the fast path allocation */
2198         CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2199             curthread, zone->uz_name, zone, flags);
2200
2201         if (flags & M_WAITOK) {
2202                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2203                     "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2204         }
2205         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2206             ("uma_zalloc_arg: called with spinlock or critical section held"));
2207
2208 #ifdef DEBUG_MEMGUARD
2209         if (memguard_cmp_zone(zone)) {
2210                 item = memguard_alloc(zone->uz_size, flags);
2211                 if (item != NULL) {
2212                         if (zone->uz_init != NULL &&
2213                             zone->uz_init(item, zone->uz_size, flags) != 0)
2214                                 return (NULL);
2215                         if (zone->uz_ctor != NULL &&
2216                             zone->uz_ctor(item, zone->uz_size, udata,
2217                             flags) != 0) {
2218                                 zone->uz_fini(item, zone->uz_size);
2219                                 return (NULL);
2220                         }
2221                         return (item);
2222                 }
2223                 /* This is unfortunate but should not be fatal. */
2224         }
2225 #endif
2226         /*
2227          * If possible, allocate from the per-CPU cache.  There are two
2228          * requirements for safe access to the per-CPU cache: (1) the thread
2229          * accessing the cache must not be preempted or yield during access,
2230          * and (2) the thread must not migrate CPUs without switching which
2231          * cache it accesses.  We rely on a critical section to prevent
2232          * preemption and migration.  We release the critical section in
2233          * order to acquire the zone mutex if we are unable to allocate from
2234          * the current cache; when we re-acquire the critical section, we
2235          * must detect and handle migration if it has occurred.
2236          */
2237         critical_enter();
2238         cpu = curcpu;
2239         cache = &zone->uz_cpu[cpu];
2240
2241 zalloc_start:
2242         bucket = cache->uc_allocbucket;
2243         if (bucket != NULL && bucket->ub_cnt > 0) {
2244                 bucket->ub_cnt--;
2245                 item = bucket->ub_bucket[bucket->ub_cnt];
2246 #ifdef INVARIANTS
2247                 bucket->ub_bucket[bucket->ub_cnt] = NULL;
2248 #endif
2249                 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2250                 cache->uc_allocs++;
2251                 critical_exit();
2252                 if (zone->uz_ctor != NULL &&
2253                     zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2254                         atomic_add_long(&zone->uz_fails, 1);
2255                         zone_free_item(zone, item, udata, SKIP_DTOR);
2256                         return (NULL);
2257                 }
2258 #ifdef INVARIANTS
2259                 uma_dbg_alloc(zone, NULL, item);
2260 #endif
2261                 if (flags & M_ZERO)
2262                         uma_zero_item(item, zone);
2263                 return (item);
2264         }
2265
2266         /*
2267          * We have run out of items in our alloc bucket.
2268          * See if we can switch with our free bucket.
2269          */
2270         bucket = cache->uc_freebucket;
2271         if (bucket != NULL && bucket->ub_cnt > 0) {
2272                 CTR2(KTR_UMA,
2273                     "uma_zalloc: zone %s(%p) swapping empty with alloc",
2274                     zone->uz_name, zone);
2275                 cache->uc_freebucket = cache->uc_allocbucket;
2276                 cache->uc_allocbucket = bucket;
2277                 goto zalloc_start;
2278         }
2279
2280         /*
2281          * Discard any empty allocation bucket while we hold no locks.
2282          */
2283         bucket = cache->uc_allocbucket;
2284         cache->uc_allocbucket = NULL;
2285         critical_exit();
2286         if (bucket != NULL)
2287                 bucket_free(zone, bucket, udata);
2288
2289         if (zone->uz_flags & UMA_ZONE_NUMA)
2290                 domain = PCPU_GET(domain);
2291         else
2292                 domain = UMA_ANYDOMAIN;
2293
2294         /* Short-circuit for zones without buckets and low memory. */
2295         if (zone->uz_count == 0 || bucketdisable)
2296                 goto zalloc_item;
2297
2298         /*
2299          * Attempt to retrieve the item from the per-CPU cache has failed, so
2300          * we must go back to the zone.  This requires the zone lock, so we
2301          * must drop the critical section, then re-acquire it when we go back
2302          * to the cache.  Since the critical section is released, we may be
2303          * preempted or migrate.  As such, make sure not to maintain any
2304          * thread-local state specific to the cache from prior to releasing
2305          * the critical section.
2306          */
2307         lockfail = 0;
2308         if (ZONE_TRYLOCK(zone) == 0) {
2309                 /* Record contention to size the buckets. */
2310                 ZONE_LOCK(zone);
2311                 lockfail = 1;
2312         }
2313         critical_enter();
2314         cpu = curcpu;
2315         cache = &zone->uz_cpu[cpu];
2316
2317         /*
2318          * Since we have locked the zone we may as well send back our stats.
2319          */
2320         atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2321         atomic_add_long(&zone->uz_frees, cache->uc_frees);
2322         cache->uc_allocs = 0;
2323         cache->uc_frees = 0;
2324
2325         /* See if we lost the race to fill the cache. */
2326         if (cache->uc_allocbucket != NULL) {
2327                 ZONE_UNLOCK(zone);
2328                 goto zalloc_start;
2329         }
2330
2331         /*
2332          * Check the zone's cache of buckets.
2333          */
2334         if (domain == UMA_ANYDOMAIN)
2335                 zdom = &zone->uz_domain[0];
2336         else
2337                 zdom = &zone->uz_domain[domain];
2338         if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
2339                 KASSERT(bucket->ub_cnt != 0,
2340                     ("uma_zalloc_arg: Returning an empty bucket."));
2341
2342                 LIST_REMOVE(bucket, ub_link);
2343                 cache->uc_allocbucket = bucket;
2344                 ZONE_UNLOCK(zone);
2345                 goto zalloc_start;
2346         }
2347         /* We are no longer associated with this CPU. */
2348         critical_exit();
2349
2350         /*
2351          * We bump the uz count when the cache size is insufficient to
2352          * handle the working set.
2353          */
2354         if (lockfail && zone->uz_count < BUCKET_MAX)
2355                 zone->uz_count++;
2356         ZONE_UNLOCK(zone);
2357
2358         /*
2359          * Now lets just fill a bucket and put it on the free list.  If that
2360          * works we'll restart the allocation from the beginning and it
2361          * will use the just filled bucket.
2362          */
2363         bucket = zone_alloc_bucket(zone, udata, domain, flags);
2364         CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2365             zone->uz_name, zone, bucket);
2366         if (bucket != NULL) {
2367                 ZONE_LOCK(zone);
2368                 critical_enter();
2369                 cpu = curcpu;
2370                 cache = &zone->uz_cpu[cpu];
2371                 /*
2372                  * See if we lost the race or were migrated.  Cache the
2373                  * initialized bucket to make this less likely or claim
2374                  * the memory directly.
2375                  */
2376                 if (cache->uc_allocbucket != NULL ||
2377                     (zone->uz_flags & UMA_ZONE_NUMA &&
2378                     domain != PCPU_GET(domain)))
2379                         LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
2380                 else
2381                         cache->uc_allocbucket = bucket;
2382                 ZONE_UNLOCK(zone);
2383                 goto zalloc_start;
2384         }
2385
2386         /*
2387          * We may not be able to get a bucket so return an actual item.
2388          */
2389 zalloc_item:
2390         item = zone_alloc_item(zone, udata, domain, flags);
2391
2392         return (item);
2393 }
2394
2395 void *
2396 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2397 {
2398
2399         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2400         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2401
2402         /* This is the fast path allocation */
2403         CTR5(KTR_UMA,
2404             "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2405             curthread, zone->uz_name, zone, domain, flags);
2406
2407         if (flags & M_WAITOK) {
2408                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2409                     "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2410         }
2411         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2412             ("uma_zalloc_domain: called with spinlock or critical section held"));
2413
2414         return (zone_alloc_item(zone, udata, domain, flags));
2415 }
2416
2417 /*
2418  * Find a slab with some space.  Prefer slabs that are partially used over those
2419  * that are totally full.  This helps to reduce fragmentation.
2420  *
2421  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2422  * only 'domain'.
2423  */
2424 static uma_slab_t
2425 keg_first_slab(uma_keg_t keg, int domain, int rr)
2426 {
2427         uma_domain_t dom;
2428         uma_slab_t slab;
2429         int start;
2430
2431         KASSERT(domain >= 0 && domain < vm_ndomains,
2432             ("keg_first_slab: domain %d out of range", domain));
2433
2434         slab = NULL;
2435         start = domain;
2436         do {
2437                 dom = &keg->uk_domain[domain];
2438                 if (!LIST_EMPTY(&dom->ud_part_slab))
2439                         return (LIST_FIRST(&dom->ud_part_slab));
2440                 if (!LIST_EMPTY(&dom->ud_free_slab)) {
2441                         slab = LIST_FIRST(&dom->ud_free_slab);
2442                         LIST_REMOVE(slab, us_link);
2443                         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2444                         return (slab);
2445                 }
2446                 if (rr)
2447                         domain = (domain + 1) % vm_ndomains;
2448         } while (domain != start);
2449
2450         return (NULL);
2451 }
2452
2453 static uma_slab_t
2454 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags)
2455 {
2456         uma_domain_t dom;
2457         uma_slab_t slab;
2458         int allocflags, domain, reserve, rr, start;
2459
2460         mtx_assert(&keg->uk_lock, MA_OWNED);
2461         slab = NULL;
2462         reserve = 0;
2463         allocflags = flags;
2464         if ((flags & M_USE_RESERVE) == 0)
2465                 reserve = keg->uk_reserve;
2466
2467         /*
2468          * Round-robin for non first-touch zones when there is more than one
2469          * domain.
2470          */
2471         if (vm_ndomains == 1)
2472                 rdomain = 0;
2473         rr = rdomain == UMA_ANYDOMAIN;
2474         if (rr) {
2475                 keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
2476                 domain = start = keg->uk_cursor;
2477                 /* Only block on the second pass. */
2478                 if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK)
2479                         allocflags = (allocflags & ~M_WAITOK) | M_NOWAIT;
2480         } else
2481                 domain = start = rdomain;
2482
2483 again:
2484         do {
2485                 if (keg->uk_free > reserve &&
2486                     (slab = keg_first_slab(keg, domain, rr)) != NULL) {
2487                         MPASS(slab->us_keg == keg);
2488                         return (slab);
2489                 }
2490
2491                 /*
2492                  * M_NOVM means don't ask at all!
2493                  */
2494                 if (flags & M_NOVM)
2495                         break;
2496
2497                 if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2498                         keg->uk_flags |= UMA_ZFLAG_FULL;
2499                         /*
2500                          * If this is not a multi-zone, set the FULL bit.
2501                          * Otherwise slab_multi() takes care of it.
2502                          */
2503                         if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2504                                 zone->uz_flags |= UMA_ZFLAG_FULL;
2505                                 zone_log_warning(zone);
2506                                 zone_maxaction(zone);
2507                         }
2508                         if (flags & M_NOWAIT)
2509                                 return (NULL);
2510                         zone->uz_sleeps++;
2511                         msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2512                         continue;
2513                 }
2514                 slab = keg_alloc_slab(keg, zone, domain, allocflags);
2515                 /*
2516                  * If we got a slab here it's safe to mark it partially used
2517                  * and return.  We assume that the caller is going to remove
2518                  * at least one item.
2519                  */
2520                 if (slab) {
2521                         MPASS(slab->us_keg == keg);
2522                         dom = &keg->uk_domain[slab->us_domain];
2523                         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2524                         return (slab);
2525                 }
2526                 if (rr) {
2527                         keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
2528                         domain = keg->uk_cursor;
2529                 }
2530         } while (domain != start);
2531
2532         /* Retry domain scan with blocking. */
2533         if (allocflags != flags) {
2534                 allocflags = flags;
2535                 goto again;
2536         }
2537
2538         /*
2539          * We might not have been able to get a slab but another cpu
2540          * could have while we were unlocked.  Check again before we
2541          * fail.
2542          */
2543         if (keg->uk_free > reserve &&
2544             (slab = keg_first_slab(keg, domain, rr)) != NULL) {
2545                 MPASS(slab->us_keg == keg);
2546                 return (slab);
2547         }
2548         return (NULL);
2549 }
2550
2551 static uma_slab_t
2552 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2553 {
2554         uma_slab_t slab;
2555
2556         if (keg == NULL) {
2557                 keg = zone_first_keg(zone);
2558                 KEG_LOCK(keg);
2559         }
2560
2561         for (;;) {
2562                 slab = keg_fetch_slab(keg, zone, domain, flags);
2563                 if (slab)
2564                         return (slab);
2565                 if (flags & (M_NOWAIT | M_NOVM))
2566                         break;
2567         }
2568         KEG_UNLOCK(keg);
2569         return (NULL);
2570 }
2571
2572 /*
2573  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2574  * with the keg locked.  On NULL no lock is held.
2575  *
2576  * The last pointer is used to seed the search.  It is not required.
2577  */
2578 static uma_slab_t
2579 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
2580 {
2581         uma_klink_t klink;
2582         uma_slab_t slab;
2583         uma_keg_t keg;
2584         int flags;
2585         int empty;
2586         int full;
2587
2588         /*
2589          * Don't wait on the first pass.  This will skip limit tests
2590          * as well.  We don't want to block if we can find a provider
2591          * without blocking.
2592          */
2593         flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2594         /*
2595          * Use the last slab allocated as a hint for where to start
2596          * the search.
2597          */
2598         if (last != NULL) {
2599                 slab = keg_fetch_slab(last, zone, domain, flags);
2600                 if (slab)
2601                         return (slab);
2602                 KEG_UNLOCK(last);
2603         }
2604         /*
2605          * Loop until we have a slab incase of transient failures
2606          * while M_WAITOK is specified.  I'm not sure this is 100%
2607          * required but we've done it for so long now.
2608          */
2609         for (;;) {
2610                 empty = 0;
2611                 full = 0;
2612                 /*
2613                  * Search the available kegs for slabs.  Be careful to hold the
2614                  * correct lock while calling into the keg layer.
2615                  */
2616                 LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2617                         keg = klink->kl_keg;
2618                         KEG_LOCK(keg);
2619                         if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2620                                 slab = keg_fetch_slab(keg, zone, domain, flags);
2621                                 if (slab)
2622                                         return (slab);
2623                         }
2624                         if (keg->uk_flags & UMA_ZFLAG_FULL)
2625                                 full++;
2626                         else
2627                                 empty++;
2628                         KEG_UNLOCK(keg);
2629                 }
2630                 if (rflags & (M_NOWAIT | M_NOVM))
2631                         break;
2632                 flags = rflags;
2633                 /*
2634                  * All kegs are full.  XXX We can't atomically check all kegs
2635                  * and sleep so just sleep for a short period and retry.
2636                  */
2637                 if (full && !empty) {
2638                         ZONE_LOCK(zone);
2639                         zone->uz_flags |= UMA_ZFLAG_FULL;
2640                         zone->uz_sleeps++;
2641                         zone_log_warning(zone);
2642                         zone_maxaction(zone);
2643                         msleep(zone, zone->uz_lockptr, PVM,
2644                             "zonelimit", hz/100);
2645                         zone->uz_flags &= ~UMA_ZFLAG_FULL;
2646                         ZONE_UNLOCK(zone);
2647                         continue;
2648                 }
2649         }
2650         return (NULL);
2651 }
2652
2653 static void *
2654 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2655 {
2656         uma_domain_t dom;
2657         void *item;
2658         uint8_t freei;
2659
2660         MPASS(keg == slab->us_keg);
2661         mtx_assert(&keg->uk_lock, MA_OWNED);
2662
2663         freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2664         BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2665         item = slab->us_data + (keg->uk_rsize * freei);
2666         slab->us_freecount--;
2667         keg->uk_free--;
2668
2669         /* Move this slab to the full list */
2670         if (slab->us_freecount == 0) {
2671                 LIST_REMOVE(slab, us_link);
2672                 dom = &keg->uk_domain[slab->us_domain];
2673                 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2674         }
2675
2676         return (item);
2677 }
2678
2679 static int
2680 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2681 {
2682         uma_slab_t slab;
2683         uma_keg_t keg;
2684         int stripe;
2685         int i;
2686
2687         slab = NULL;
2688         keg = NULL;
2689         /* Try to keep the buckets totally full */
2690         for (i = 0; i < max; ) {
2691                 if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
2692                         break;
2693                 keg = slab->us_keg;
2694                 stripe = howmany(max, vm_ndomains);
2695                 while (slab->us_freecount && i < max) {
2696                         bucket[i++] = slab_alloc_item(keg, slab);
2697                         if (keg->uk_free <= keg->uk_reserve)
2698                                 break;
2699 #ifdef NUMA
2700                         /*
2701                          * If the zone is striped we pick a new slab for every
2702                          * N allocations.  Eliminating this conditional will
2703                          * instead pick a new domain for each bucket rather
2704                          * than stripe within each bucket.  The current option
2705                          * produces more fragmentation and requires more cpu
2706                          * time but yields better distribution.
2707                          */
2708                         if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2709                             vm_ndomains > 1 && --stripe == 0)
2710                                 break;
2711 #endif
2712                 }
2713                 /* Don't block if we allocated any successfully. */
2714                 flags &= ~M_WAITOK;
2715                 flags |= M_NOWAIT;
2716         }
2717         if (slab != NULL)
2718                 KEG_UNLOCK(keg);
2719
2720         return i;
2721 }
2722
2723 static uma_bucket_t
2724 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
2725 {
2726         uma_bucket_t bucket;
2727         int max;
2728
2729         /* Don't wait for buckets, preserve caller's NOVM setting. */
2730         bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2731         if (bucket == NULL)
2732                 return (NULL);
2733
2734         max = MIN(bucket->ub_entries, zone->uz_count);
2735         bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2736             max, domain, flags);
2737
2738         /*
2739          * Initialize the memory if necessary.
2740          */
2741         if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2742                 int i;
2743
2744                 for (i = 0; i < bucket->ub_cnt; i++)
2745                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2746                             flags) != 0)
2747                                 break;
2748                 /*
2749                  * If we couldn't initialize the whole bucket, put the
2750                  * rest back onto the freelist.
2751                  */
2752                 if (i != bucket->ub_cnt) {
2753                         zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2754                             bucket->ub_cnt - i);
2755 #ifdef INVARIANTS
2756                         bzero(&bucket->ub_bucket[i],
2757                             sizeof(void *) * (bucket->ub_cnt - i));
2758 #endif
2759                         bucket->ub_cnt = i;
2760                 }
2761         }
2762
2763         if (bucket->ub_cnt == 0) {
2764                 bucket_free(zone, bucket, udata);
2765                 atomic_add_long(&zone->uz_fails, 1);
2766                 return (NULL);
2767         }
2768
2769         return (bucket);
2770 }
2771
2772 /*
2773  * Allocates a single item from a zone.
2774  *
2775  * Arguments
2776  *      zone   The zone to alloc for.
2777  *      udata  The data to be passed to the constructor.
2778  *      domain The domain to allocate from or UMA_ANYDOMAIN.
2779  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
2780  *
2781  * Returns
2782  *      NULL if there is no memory and M_NOWAIT is set
2783  *      An item if successful
2784  */
2785
2786 static void *
2787 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
2788 {
2789         void *item;
2790
2791         item = NULL;
2792
2793         if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
2794                 goto fail;
2795         atomic_add_long(&zone->uz_allocs, 1);
2796
2797         /*
2798          * We have to call both the zone's init (not the keg's init)
2799          * and the zone's ctor.  This is because the item is going from
2800          * a keg slab directly to the user, and the user is expecting it
2801          * to be both zone-init'd as well as zone-ctor'd.
2802          */
2803         if (zone->uz_init != NULL) {
2804                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2805                         zone_free_item(zone, item, udata, SKIP_FINI);
2806                         goto fail;
2807                 }
2808         }
2809         if (zone->uz_ctor != NULL) {
2810                 if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2811                         zone_free_item(zone, item, udata, SKIP_DTOR);
2812                         goto fail;
2813                 }
2814         }
2815 #ifdef INVARIANTS
2816         uma_dbg_alloc(zone, NULL, item);
2817 #endif
2818         if (flags & M_ZERO)
2819                 uma_zero_item(item, zone);
2820
2821         CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
2822             zone->uz_name, zone);
2823
2824         return (item);
2825
2826 fail:
2827         CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
2828             zone->uz_name, zone);
2829         atomic_add_long(&zone->uz_fails, 1);
2830         return (NULL);
2831 }
2832
2833 /* See uma.h */
2834 void
2835 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2836 {
2837         uma_cache_t cache;
2838         uma_bucket_t bucket;
2839         uma_zone_domain_t zdom;
2840         int cpu, domain, lockfail;
2841
2842         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2843         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2844
2845         CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2846             zone->uz_name);
2847
2848         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2849             ("uma_zfree_arg: called with spinlock or critical section held"));
2850
2851         /* uma_zfree(..., NULL) does nothing, to match free(9). */
2852         if (item == NULL)
2853                 return;
2854 #ifdef DEBUG_MEMGUARD
2855         if (is_memguard_addr(item)) {
2856                 if (zone->uz_dtor != NULL)
2857                         zone->uz_dtor(item, zone->uz_size, udata);
2858                 if (zone->uz_fini != NULL)
2859                         zone->uz_fini(item, zone->uz_size);
2860                 memguard_free(item);
2861                 return;
2862         }
2863 #endif
2864 #ifdef INVARIANTS
2865         if (zone->uz_flags & UMA_ZONE_MALLOC)
2866                 uma_dbg_free(zone, udata, item);
2867         else
2868                 uma_dbg_free(zone, NULL, item);
2869 #endif
2870         if (zone->uz_dtor != NULL)
2871                 zone->uz_dtor(item, zone->uz_size, udata);
2872
2873         /*
2874          * The race here is acceptable.  If we miss it we'll just have to wait
2875          * a little longer for the limits to be reset.
2876          */
2877         if (zone->uz_flags & UMA_ZFLAG_FULL)
2878                 goto zfree_item;
2879
2880         /*
2881          * If possible, free to the per-CPU cache.  There are two
2882          * requirements for safe access to the per-CPU cache: (1) the thread
2883          * accessing the cache must not be preempted or yield during access,
2884          * and (2) the thread must not migrate CPUs without switching which
2885          * cache it accesses.  We rely on a critical section to prevent
2886          * preemption and migration.  We release the critical section in
2887          * order to acquire the zone mutex if we are unable to free to the
2888          * current cache; when we re-acquire the critical section, we must
2889          * detect and handle migration if it has occurred.
2890          */
2891 zfree_restart:
2892         critical_enter();
2893         cpu = curcpu;
2894         cache = &zone->uz_cpu[cpu];
2895
2896 zfree_start:
2897         /*
2898          * Try to free into the allocbucket first to give LIFO ordering
2899          * for cache-hot datastructures.  Spill over into the freebucket
2900          * if necessary.  Alloc will swap them if one runs dry.
2901          */
2902         bucket = cache->uc_allocbucket;
2903         if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
2904                 bucket = cache->uc_freebucket;
2905         if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2906                 KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2907                     ("uma_zfree: Freeing to non free bucket index."));
2908                 bucket->ub_bucket[bucket->ub_cnt] = item;
2909                 bucket->ub_cnt++;
2910                 cache->uc_frees++;
2911                 critical_exit();
2912                 return;
2913         }
2914
2915         /*
2916          * We must go back the zone, which requires acquiring the zone lock,
2917          * which in turn means we must release and re-acquire the critical
2918          * section.  Since the critical section is released, we may be
2919          * preempted or migrate.  As such, make sure not to maintain any
2920          * thread-local state specific to the cache from prior to releasing
2921          * the critical section.
2922          */
2923         critical_exit();
2924         if (zone->uz_count == 0 || bucketdisable)
2925                 goto zfree_item;
2926
2927         lockfail = 0;
2928         if (ZONE_TRYLOCK(zone) == 0) {
2929                 /* Record contention to size the buckets. */
2930                 ZONE_LOCK(zone);
2931                 lockfail = 1;
2932         }
2933         critical_enter();
2934         cpu = curcpu;
2935         cache = &zone->uz_cpu[cpu];
2936
2937         /*
2938          * Since we have locked the zone we may as well send back our stats.
2939          */
2940         atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2941         atomic_add_long(&zone->uz_frees, cache->uc_frees);
2942         cache->uc_allocs = 0;
2943         cache->uc_frees = 0;
2944
2945         bucket = cache->uc_freebucket;
2946         if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2947                 ZONE_UNLOCK(zone);
2948                 goto zfree_start;
2949         }
2950         cache->uc_freebucket = NULL;
2951         /* We are no longer associated with this CPU. */
2952         critical_exit();
2953
2954         if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
2955                 domain = PCPU_GET(domain);
2956         else
2957                 domain = 0;
2958         zdom = &zone->uz_domain[0];
2959
2960         /* Can we throw this on the zone full list? */
2961         if (bucket != NULL) {
2962                 CTR3(KTR_UMA,
2963                     "uma_zfree: zone %s(%p) putting bucket %p on free list",
2964                     zone->uz_name, zone, bucket);
2965                 /* ub_cnt is pointing to the last free item */
2966                 KASSERT(bucket->ub_cnt != 0,
2967                     ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2968                 LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
2969         }
2970
2971         /*
2972          * We bump the uz count when the cache size is insufficient to
2973          * handle the working set.
2974          */
2975         if (lockfail && zone->uz_count < BUCKET_MAX)
2976                 zone->uz_count++;
2977         ZONE_UNLOCK(zone);
2978
2979         bucket = bucket_alloc(zone, udata, M_NOWAIT);
2980         CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
2981             zone->uz_name, zone, bucket);
2982         if (bucket) {
2983                 critical_enter();
2984                 cpu = curcpu;
2985                 cache = &zone->uz_cpu[cpu];
2986                 if (cache->uc_freebucket == NULL &&
2987                     ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2988                     domain == PCPU_GET(domain))) {
2989                         cache->uc_freebucket = bucket;
2990                         goto zfree_start;
2991                 }
2992                 /*
2993                  * We lost the race, start over.  We have to drop our
2994                  * critical section to free the bucket.
2995                  */
2996                 critical_exit();
2997                 bucket_free(zone, bucket, udata);
2998                 goto zfree_restart;
2999         }
3000
3001         /*
3002          * If nothing else caught this, we'll just do an internal free.
3003          */
3004 zfree_item:
3005         zone_free_item(zone, item, udata, SKIP_DTOR);
3006
3007         return;
3008 }
3009
3010 void
3011 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3012 {
3013
3014         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3015         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
3016
3017         CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3018             zone->uz_name);
3019
3020         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3021             ("uma_zfree_domain: called with spinlock or critical section held"));
3022
3023         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3024         if (item == NULL)
3025                 return;
3026         zone_free_item(zone, item, udata, SKIP_NONE);
3027 }
3028
3029 static void
3030 slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
3031 {
3032         uma_domain_t dom;
3033         uint8_t freei;
3034
3035         mtx_assert(&keg->uk_lock, MA_OWNED);
3036         MPASS(keg == slab->us_keg);
3037
3038         dom = &keg->uk_domain[slab->us_domain];
3039
3040         /* Do we need to remove from any lists? */
3041         if (slab->us_freecount+1 == keg->uk_ipers) {
3042                 LIST_REMOVE(slab, us_link);
3043                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3044         } else if (slab->us_freecount == 0) {
3045                 LIST_REMOVE(slab, us_link);
3046                 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3047         }
3048
3049         /* Slab management. */
3050         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3051         BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3052         slab->us_freecount++;
3053
3054         /* Keg statistics. */
3055         keg->uk_free++;
3056 }
3057
3058 static void
3059 zone_release(uma_zone_t zone, void **bucket, int cnt)
3060 {
3061         void *item;
3062         uma_slab_t slab;
3063         uma_keg_t keg;
3064         uint8_t *mem;
3065         int clearfull;
3066         int i;
3067
3068         clearfull = 0;
3069         keg = zone_first_keg(zone);
3070         KEG_LOCK(keg);
3071         for (i = 0; i < cnt; i++) {
3072                 item = bucket[i];
3073                 if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3074                         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3075                         if (zone->uz_flags & UMA_ZONE_HASH) {
3076                                 slab = hash_sfind(&keg->uk_hash, mem);
3077                         } else {
3078                                 mem += keg->uk_pgoff;
3079                                 slab = (uma_slab_t)mem;
3080                         }
3081                 } else {
3082                         slab = vtoslab((vm_offset_t)item);
3083                         if (slab->us_keg != keg) {
3084                                 KEG_UNLOCK(keg);
3085                                 keg = slab->us_keg;
3086                                 KEG_LOCK(keg);
3087                         }
3088                 }
3089                 slab_free_item(keg, slab, item);
3090                 if (keg->uk_flags & UMA_ZFLAG_FULL) {
3091                         if (keg->uk_pages < keg->uk_maxpages) {
3092                                 keg->uk_flags &= ~UMA_ZFLAG_FULL;
3093                                 clearfull = 1;
3094                         }
3095
3096                         /*
3097                          * We can handle one more allocation. Since we're
3098                          * clearing ZFLAG_FULL, wake up all procs blocked
3099                          * on pages. This should be uncommon, so keeping this
3100                          * simple for now (rather than adding count of blocked
3101                          * threads etc).
3102                          */
3103                         wakeup(keg);
3104                 }
3105         }
3106         KEG_UNLOCK(keg);
3107         if (clearfull) {
3108                 ZONE_LOCK(zone);
3109                 zone->uz_flags &= ~UMA_ZFLAG_FULL;
3110                 wakeup(zone);
3111                 ZONE_UNLOCK(zone);
3112         }
3113
3114 }
3115
3116 /*
3117  * Frees a single item to any zone.
3118  *
3119  * Arguments:
3120  *      zone   The zone to free to
3121  *      item   The item we're freeing
3122  *      udata  User supplied data for the dtor
3123  *      skip   Skip dtors and finis
3124  */
3125 static void
3126 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3127 {
3128
3129 #ifdef INVARIANTS
3130         if (skip == SKIP_NONE) {
3131                 if (zone->uz_flags & UMA_ZONE_MALLOC)
3132                         uma_dbg_free(zone, udata, item);
3133                 else
3134                         uma_dbg_free(zone, NULL, item);
3135         }
3136 #endif
3137         if (skip < SKIP_DTOR && zone->uz_dtor)
3138                 zone->uz_dtor(item, zone->uz_size, udata);
3139
3140         if (skip < SKIP_FINI && zone->uz_fini)
3141                 zone->uz_fini(item, zone->uz_size);
3142
3143         atomic_add_long(&zone->uz_frees, 1);
3144         zone->uz_release(zone->uz_arg, &item, 1);
3145 }
3146
3147 /* See uma.h */
3148 int
3149 uma_zone_set_max(uma_zone_t zone, int nitems)
3150 {
3151         uma_keg_t keg;
3152
3153         keg = zone_first_keg(zone);
3154         if (keg == NULL)
3155                 return (0);
3156         KEG_LOCK(keg);
3157         keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
3158         if (keg->uk_maxpages * keg->uk_ipers < nitems)
3159                 keg->uk_maxpages += keg->uk_ppera;
3160         nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3161         KEG_UNLOCK(keg);
3162
3163         return (nitems);
3164 }
3165
3166 /* See uma.h */
3167 int
3168 uma_zone_get_max(uma_zone_t zone)
3169 {
3170         int nitems;
3171         uma_keg_t keg;
3172
3173         keg = zone_first_keg(zone);
3174         if (keg == NULL)
3175                 return (0);
3176         KEG_LOCK(keg);
3177         nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3178         KEG_UNLOCK(keg);
3179
3180         return (nitems);
3181 }
3182
3183 /* See uma.h */
3184 void
3185 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3186 {
3187
3188         ZONE_LOCK(zone);
3189         zone->uz_warning = warning;
3190         ZONE_UNLOCK(zone);
3191 }
3192
3193 /* See uma.h */
3194 void
3195 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3196 {
3197
3198         ZONE_LOCK(zone);
3199         TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3200         ZONE_UNLOCK(zone);
3201 }
3202
3203 /* See uma.h */
3204 int
3205 uma_zone_get_cur(uma_zone_t zone)
3206 {
3207         int64_t nitems;
3208         u_int i;
3209
3210         ZONE_LOCK(zone);
3211         nitems = zone->uz_allocs - zone->uz_frees;
3212         CPU_FOREACH(i) {
3213                 /*
3214                  * See the comment in sysctl_vm_zone_stats() regarding the
3215                  * safety of accessing the per-cpu caches. With the zone lock
3216                  * held, it is safe, but can potentially result in stale data.
3217                  */
3218                 nitems += zone->uz_cpu[i].uc_allocs -
3219                     zone->uz_cpu[i].uc_frees;
3220         }
3221         ZONE_UNLOCK(zone);
3222
3223         return (nitems < 0 ? 0 : nitems);
3224 }
3225
3226 /* See uma.h */
3227 void
3228 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3229 {
3230         uma_keg_t keg;
3231
3232         keg = zone_first_keg(zone);
3233         KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
3234         KEG_LOCK(keg);
3235         KASSERT(keg->uk_pages == 0,
3236             ("uma_zone_set_init on non-empty keg"));
3237         keg->uk_init = uminit;
3238         KEG_UNLOCK(keg);
3239 }
3240
3241 /* See uma.h */
3242 void
3243 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3244 {
3245         uma_keg_t keg;
3246
3247         keg = zone_first_keg(zone);
3248         KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3249         KEG_LOCK(keg);
3250         KASSERT(keg->uk_pages == 0,
3251             ("uma_zone_set_fini on non-empty keg"));
3252         keg->uk_fini = fini;
3253         KEG_UNLOCK(keg);
3254 }
3255
3256 /* See uma.h */
3257 void
3258 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3259 {
3260
3261         ZONE_LOCK(zone);
3262         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3263             ("uma_zone_set_zinit on non-empty keg"));
3264         zone->uz_init = zinit;
3265         ZONE_UNLOCK(zone);
3266 }
3267
3268 /* See uma.h */
3269 void
3270 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3271 {
3272
3273         ZONE_LOCK(zone);
3274         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3275             ("uma_zone_set_zfini on non-empty keg"));
3276         zone->uz_fini = zfini;
3277         ZONE_UNLOCK(zone);
3278 }
3279
3280 /* See uma.h */
3281 /* XXX uk_freef is not actually used with the zone locked */
3282 void
3283 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3284 {
3285         uma_keg_t keg;
3286
3287         keg = zone_first_keg(zone);
3288         KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3289         KEG_LOCK(keg);
3290         keg->uk_freef = freef;
3291         KEG_UNLOCK(keg);
3292 }
3293
3294 /* See uma.h */
3295 /* XXX uk_allocf is not actually used with the zone locked */
3296 void
3297 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3298 {
3299         uma_keg_t keg;
3300
3301         keg = zone_first_keg(zone);
3302         KEG_LOCK(keg);
3303         keg->uk_allocf = allocf;
3304         KEG_UNLOCK(keg);
3305 }
3306
3307 /* See uma.h */
3308 void
3309 uma_zone_reserve(uma_zone_t zone, int items)
3310 {
3311         uma_keg_t keg;
3312
3313         keg = zone_first_keg(zone);
3314         if (keg == NULL)
3315                 return;
3316         KEG_LOCK(keg);
3317         keg->uk_reserve = items;
3318         KEG_UNLOCK(keg);
3319
3320         return;
3321 }
3322
3323 /* See uma.h */
3324 int
3325 uma_zone_reserve_kva(uma_zone_t zone, int count)
3326 {
3327         uma_keg_t keg;
3328         vm_offset_t kva;
3329         u_int pages;
3330
3331         keg = zone_first_keg(zone);
3332         if (keg == NULL)
3333                 return (0);
3334         pages = count / keg->uk_ipers;
3335
3336         if (pages * keg->uk_ipers < count)
3337                 pages++;
3338         pages *= keg->uk_ppera;
3339
3340 #ifdef UMA_MD_SMALL_ALLOC
3341         if (keg->uk_ppera > 1) {
3342 #else
3343         if (1) {
3344 #endif
3345                 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3346                 if (kva == 0)
3347                         return (0);
3348         } else
3349                 kva = 0;
3350         KEG_LOCK(keg);
3351         keg->uk_kva = kva;
3352         keg->uk_offset = 0;
3353         keg->uk_maxpages = pages;
3354 #ifdef UMA_MD_SMALL_ALLOC
3355         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3356 #else
3357         keg->uk_allocf = noobj_alloc;
3358 #endif
3359         keg->uk_flags |= UMA_ZONE_NOFREE;
3360         KEG_UNLOCK(keg);
3361
3362         return (1);
3363 }
3364
3365 /* See uma.h */
3366 void
3367 uma_prealloc(uma_zone_t zone, int items)
3368 {
3369         uma_domain_t dom;
3370         uma_slab_t slab;
3371         uma_keg_t keg;
3372         int domain, slabs;
3373
3374         keg = zone_first_keg(zone);
3375         if (keg == NULL)
3376                 return;
3377         KEG_LOCK(keg);
3378         slabs = items / keg->uk_ipers;
3379         domain = 0;
3380         if (slabs * keg->uk_ipers < items)
3381                 slabs++;
3382         while (slabs > 0) {
3383                 slab = keg_alloc_slab(keg, zone, domain, M_WAITOK);
3384                 if (slab == NULL)
3385                         break;
3386                 MPASS(slab->us_keg == keg);
3387                 dom = &keg->uk_domain[slab->us_domain];
3388                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3389                 slabs--;
3390                 domain = (domain + 1) % vm_ndomains;
3391         }
3392         KEG_UNLOCK(keg);
3393 }
3394
3395 /* See uma.h */
3396 static void
3397 uma_reclaim_locked(bool kmem_danger)
3398 {
3399
3400         CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3401         sx_assert(&uma_drain_lock, SA_XLOCKED);
3402         bucket_enable();
3403         zone_foreach(zone_drain);
3404         if (vm_page_count_min() || kmem_danger) {
3405                 cache_drain_safe(NULL);
3406                 zone_foreach(zone_drain);
3407         }
3408         /*
3409          * Some slabs may have been freed but this zone will be visited early
3410          * we visit again so that we can free pages that are empty once other
3411          * zones are drained.  We have to do the same for buckets.
3412          */
3413         zone_drain(slabzone);
3414         bucket_zone_drain();
3415 }
3416
3417 void
3418 uma_reclaim(void)
3419 {
3420
3421         sx_xlock(&uma_drain_lock);
3422         uma_reclaim_locked(false);
3423         sx_xunlock(&uma_drain_lock);
3424 }
3425
3426 static volatile int uma_reclaim_needed;
3427
3428 void
3429 uma_reclaim_wakeup(void)
3430 {
3431
3432         if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3433                 wakeup(uma_reclaim);
3434 }
3435
3436 void
3437 uma_reclaim_worker(void *arg __unused)
3438 {
3439
3440         for (;;) {
3441                 sx_xlock(&uma_drain_lock);
3442                 while (atomic_load_int(&uma_reclaim_needed) == 0)
3443                         sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
3444                             hz);
3445                 sx_xunlock(&uma_drain_lock);
3446                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3447                 sx_xlock(&uma_drain_lock);
3448                 uma_reclaim_locked(true);
3449                 atomic_store_int(&uma_reclaim_needed, 0);
3450                 sx_xunlock(&uma_drain_lock);
3451                 /* Don't fire more than once per-second. */
3452                 pause("umarclslp", hz);
3453         }
3454 }
3455
3456 /* See uma.h */
3457 int
3458 uma_zone_exhausted(uma_zone_t zone)
3459 {
3460         int full;
3461
3462         ZONE_LOCK(zone);
3463         full = (zone->uz_flags & UMA_ZFLAG_FULL);
3464         ZONE_UNLOCK(zone);
3465         return (full);
3466 }
3467
3468 int
3469 uma_zone_exhausted_nolock(uma_zone_t zone)
3470 {
3471         return (zone->uz_flags & UMA_ZFLAG_FULL);
3472 }
3473
3474 void *
3475 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3476 {
3477         vm_offset_t addr;
3478         uma_slab_t slab;
3479
3480         slab = zone_alloc_item(slabzone, NULL, domain, wait);
3481         if (slab == NULL)
3482                 return (NULL);
3483         if (domain == UMA_ANYDOMAIN)
3484                 addr = kmem_malloc(kernel_arena, size, wait);
3485         else
3486                 addr = kmem_malloc_domain(domain, size, wait);
3487         if (addr != 0) {
3488                 vsetslab(addr, slab);
3489                 slab->us_data = (void *)addr;
3490                 slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3491                 slab->us_size = size;
3492                 slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3493                     pmap_kextract(addr)));
3494                 uma_total_inc(size);
3495         } else {
3496                 zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3497         }
3498
3499         return ((void *)addr);
3500 }
3501
3502 void *
3503 uma_large_malloc(vm_size_t size, int wait)
3504 {
3505
3506         return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3507 }
3508
3509 void
3510 uma_large_free(uma_slab_t slab)
3511 {
3512
3513         KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3514             ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3515         kmem_free(kernel_arena, (vm_offset_t)slab->us_data, slab->us_size);
3516         uma_total_dec(slab->us_size);
3517         zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3518 }
3519
3520 static void
3521 uma_zero_item(void *item, uma_zone_t zone)
3522 {
3523         int i;
3524
3525         if (zone->uz_flags & UMA_ZONE_PCPU) {
3526                 CPU_FOREACH(i)
3527                         bzero(zpcpu_get_cpu(item, i), zone->uz_size);
3528         } else
3529                 bzero(item, zone->uz_size);
3530 }
3531
3532 unsigned long
3533 uma_limit(void)
3534 {
3535
3536         return (uma_kmem_limit);
3537 }
3538
3539 void
3540 uma_set_limit(unsigned long limit)
3541 {
3542
3543         uma_kmem_limit = limit;
3544 }
3545
3546 unsigned long
3547 uma_size(void)
3548 {
3549
3550         return (uma_kmem_total);
3551 }
3552
3553 long
3554 uma_avail(void)
3555 {
3556
3557         return (uma_kmem_limit - uma_kmem_total);
3558 }
3559
3560 void
3561 uma_print_stats(void)
3562 {
3563         zone_foreach(uma_print_zone);
3564 }
3565
3566 static void
3567 slab_print(uma_slab_t slab)
3568 {
3569         printf("slab: keg %p, data %p, freecount %d\n",
3570                 slab->us_keg, slab->us_data, slab->us_freecount);
3571 }
3572
3573 static void
3574 cache_print(uma_cache_t cache)
3575 {
3576         printf("alloc: %p(%d), free: %p(%d)\n",
3577                 cache->uc_allocbucket,
3578                 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3579                 cache->uc_freebucket,
3580                 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3581 }
3582
3583 static void
3584 uma_print_keg(uma_keg_t keg)
3585 {
3586         uma_domain_t dom;
3587         uma_slab_t slab;
3588         int i;
3589
3590         printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3591             "out %d free %d limit %d\n",
3592             keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3593             keg->uk_ipers, keg->uk_ppera,
3594             (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3595             keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3596         for (i = 0; i < vm_ndomains; i++) {
3597                 dom = &keg->uk_domain[i];
3598                 printf("Part slabs:\n");
3599                 LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3600                         slab_print(slab);
3601                 printf("Free slabs:\n");
3602                 LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3603                         slab_print(slab);
3604                 printf("Full slabs:\n");
3605                 LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3606                         slab_print(slab);
3607         }
3608 }
3609
3610 void
3611 uma_print_zone(uma_zone_t zone)
3612 {
3613         uma_cache_t cache;
3614         uma_klink_t kl;
3615         int i;
3616
3617         printf("zone: %s(%p) size %d flags %#x\n",
3618             zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3619         LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3620                 uma_print_keg(kl->kl_keg);
3621         CPU_FOREACH(i) {
3622                 cache = &zone->uz_cpu[i];
3623                 printf("CPU %d Cache:\n", i);
3624                 cache_print(cache);
3625         }
3626 }
3627
3628 #ifdef DDB
3629 /*
3630  * Generate statistics across both the zone and its per-cpu cache's.  Return
3631  * desired statistics if the pointer is non-NULL for that statistic.
3632  *
3633  * Note: does not update the zone statistics, as it can't safely clear the
3634  * per-CPU cache statistic.
3635  *
3636  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3637  * safe from off-CPU; we should modify the caches to track this information
3638  * directly so that we don't have to.
3639  */
3640 static void
3641 uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
3642     uint64_t *freesp, uint64_t *sleepsp)
3643 {
3644         uma_cache_t cache;
3645         uint64_t allocs, frees, sleeps;
3646         int cachefree, cpu;
3647
3648         allocs = frees = sleeps = 0;
3649         cachefree = 0;
3650         CPU_FOREACH(cpu) {
3651                 cache = &z->uz_cpu[cpu];
3652                 if (cache->uc_allocbucket != NULL)
3653                         cachefree += cache->uc_allocbucket->ub_cnt;
3654                 if (cache->uc_freebucket != NULL)
3655                         cachefree += cache->uc_freebucket->ub_cnt;
3656                 allocs += cache->uc_allocs;
3657                 frees += cache->uc_frees;
3658         }
3659         allocs += z->uz_allocs;
3660         frees += z->uz_frees;
3661         sleeps += z->uz_sleeps;
3662         if (cachefreep != NULL)
3663                 *cachefreep = cachefree;
3664         if (allocsp != NULL)
3665                 *allocsp = allocs;
3666         if (freesp != NULL)
3667                 *freesp = frees;
3668         if (sleepsp != NULL)
3669                 *sleepsp = sleeps;
3670 }
3671 #endif /* DDB */
3672
3673 static int
3674 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3675 {
3676         uma_keg_t kz;
3677         uma_zone_t z;
3678         int count;
3679
3680         count = 0;
3681         rw_rlock(&uma_rwlock);
3682         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3683                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3684                         count++;
3685         }
3686         rw_runlock(&uma_rwlock);
3687         return (sysctl_handle_int(oidp, &count, 0, req));
3688 }
3689
3690 static int
3691 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3692 {
3693         struct uma_stream_header ush;
3694         struct uma_type_header uth;
3695         struct uma_percpu_stat ups;
3696         uma_bucket_t bucket;
3697         uma_zone_domain_t zdom;
3698         struct sbuf sbuf;
3699         uma_cache_t cache;
3700         uma_klink_t kl;
3701         uma_keg_t kz;
3702         uma_zone_t z;
3703         uma_keg_t k;
3704         int count, error, i;
3705
3706         error = sysctl_wire_old_buffer(req, 0);
3707         if (error != 0)
3708                 return (error);
3709         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3710         sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
3711
3712         count = 0;
3713         rw_rlock(&uma_rwlock);
3714         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3715                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3716                         count++;
3717         }
3718
3719         /*
3720          * Insert stream header.
3721          */
3722         bzero(&ush, sizeof(ush));
3723         ush.ush_version = UMA_STREAM_VERSION;
3724         ush.ush_maxcpus = (mp_maxid + 1);
3725         ush.ush_count = count;
3726         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3727
3728         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3729                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3730                         bzero(&uth, sizeof(uth));
3731                         ZONE_LOCK(z);
3732                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3733                         uth.uth_align = kz->uk_align;
3734                         uth.uth_size = kz->uk_size;
3735                         uth.uth_rsize = kz->uk_rsize;
3736                         LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3737                                 k = kl->kl_keg;
3738                                 uth.uth_maxpages += k->uk_maxpages;
3739                                 uth.uth_pages += k->uk_pages;
3740                                 uth.uth_keg_free += k->uk_free;
3741                                 uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3742                                     * k->uk_ipers;
3743                         }
3744
3745                         /*
3746                          * A zone is secondary is it is not the first entry
3747                          * on the keg's zone list.
3748                          */
3749                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3750                             (LIST_FIRST(&kz->uk_zones) != z))
3751                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3752
3753                         for (i = 0; i < vm_ndomains; i++) {
3754                                 zdom = &z->uz_domain[i];
3755                                 LIST_FOREACH(bucket, &zdom->uzd_buckets,
3756                                     ub_link)
3757                                         uth.uth_zone_free += bucket->ub_cnt;
3758                         }
3759                         uth.uth_allocs = z->uz_allocs;
3760                         uth.uth_frees = z->uz_frees;
3761                         uth.uth_fails = z->uz_fails;
3762                         uth.uth_sleeps = z->uz_sleeps;
3763                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3764                         /*
3765                          * While it is not normally safe to access the cache
3766                          * bucket pointers while not on the CPU that owns the
3767                          * cache, we only allow the pointers to be exchanged
3768                          * without the zone lock held, not invalidated, so
3769                          * accept the possible race associated with bucket
3770                          * exchange during monitoring.
3771                          */
3772                         for (i = 0; i < (mp_maxid + 1); i++) {
3773                                 bzero(&ups, sizeof(ups));
3774                                 if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3775                                         goto skip;
3776                                 if (CPU_ABSENT(i))
3777                                         goto skip;
3778                                 cache = &z->uz_cpu[i];
3779                                 if (cache->uc_allocbucket != NULL)
3780                                         ups.ups_cache_free +=
3781                                             cache->uc_allocbucket->ub_cnt;
3782                                 if (cache->uc_freebucket != NULL)
3783                                         ups.ups_cache_free +=
3784                                             cache->uc_freebucket->ub_cnt;
3785                                 ups.ups_allocs = cache->uc_allocs;
3786                                 ups.ups_frees = cache->uc_frees;
3787 skip:
3788                                 (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3789                         }
3790                         ZONE_UNLOCK(z);
3791                 }
3792         }
3793         rw_runlock(&uma_rwlock);
3794         error = sbuf_finish(&sbuf);
3795         sbuf_delete(&sbuf);
3796         return (error);
3797 }
3798
3799 int
3800 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
3801 {
3802         uma_zone_t zone = *(uma_zone_t *)arg1;
3803         int error, max;
3804
3805         max = uma_zone_get_max(zone);
3806         error = sysctl_handle_int(oidp, &max, 0, req);
3807         if (error || !req->newptr)
3808                 return (error);
3809
3810         uma_zone_set_max(zone, max);
3811
3812         return (0);
3813 }
3814
3815 int
3816 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
3817 {
3818         uma_zone_t zone = *(uma_zone_t *)arg1;
3819         int cur;
3820
3821         cur = uma_zone_get_cur(zone);
3822         return (sysctl_handle_int(oidp, &cur, 0, req));
3823 }
3824
3825 #ifdef INVARIANTS
3826 static uma_slab_t
3827 uma_dbg_getslab(uma_zone_t zone, void *item)
3828 {
3829         uma_slab_t slab;
3830         uma_keg_t keg;
3831         uint8_t *mem;
3832
3833         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3834         if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
3835                 slab = vtoslab((vm_offset_t)mem);
3836         } else {
3837                 /*
3838                  * It is safe to return the slab here even though the
3839                  * zone is unlocked because the item's allocation state
3840                  * essentially holds a reference.
3841                  */
3842                 ZONE_LOCK(zone);
3843                 keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
3844                 if (keg->uk_flags & UMA_ZONE_HASH)
3845                         slab = hash_sfind(&keg->uk_hash, mem);
3846                 else
3847                         slab = (uma_slab_t)(mem + keg->uk_pgoff);
3848                 ZONE_UNLOCK(zone);
3849         }
3850
3851         return (slab);
3852 }
3853
3854 /*
3855  * Set up the slab's freei data such that uma_dbg_free can function.
3856  *
3857  */
3858 static void
3859 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
3860 {
3861         uma_keg_t keg;
3862         int freei;
3863
3864         if (zone_first_keg(zone) == NULL)
3865                 return;
3866         if (slab == NULL) {
3867                 slab = uma_dbg_getslab(zone, item);
3868                 if (slab == NULL)
3869                         panic("uma: item %p did not belong to zone %s\n",
3870                             item, zone->uz_name);
3871         }
3872         keg = slab->us_keg;
3873         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3874
3875         if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3876                 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
3877                     item, zone, zone->uz_name, slab, freei);
3878         BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3879
3880         return;
3881 }
3882
3883 /*
3884  * Verifies freed addresses.  Checks for alignment, valid slab membership
3885  * and duplicate frees.
3886  *
3887  */
3888 static void
3889 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
3890 {
3891         uma_keg_t keg;
3892         int freei;
3893
3894         if (zone_first_keg(zone) == NULL)
3895                 return;
3896         if (slab == NULL) {
3897                 slab = uma_dbg_getslab(zone, item);
3898                 if (slab == NULL)
3899                         panic("uma: Freed item %p did not belong to zone %s\n",
3900                             item, zone->uz_name);
3901         }
3902         keg = slab->us_keg;
3903         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3904
3905         if (freei >= keg->uk_ipers)
3906                 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
3907                     item, zone, zone->uz_name, slab, freei);
3908
3909         if (((freei * keg->uk_rsize) + slab->us_data) != item)
3910                 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
3911                     item, zone, zone->uz_name, slab, freei);
3912
3913         if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3914                 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
3915                     item, zone, zone->uz_name, slab, freei);
3916
3917         BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3918 }
3919 #endif /* INVARIANTS */
3920
3921 #ifdef DDB
3922 DB_SHOW_COMMAND(uma, db_show_uma)
3923 {
3924         uma_bucket_t bucket;
3925         uma_keg_t kz;
3926         uma_zone_t z;
3927         uma_zone_domain_t zdom;
3928         uint64_t allocs, frees, sleeps;
3929         int cachefree, i;
3930
3931         db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
3932             "Free", "Requests", "Sleeps", "Bucket");
3933         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3934                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3935                         if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
3936                                 allocs = z->uz_allocs;
3937                                 frees = z->uz_frees;
3938                                 sleeps = z->uz_sleeps;
3939                                 cachefree = 0;
3940                         } else
3941                                 uma_zone_sumstat(z, &cachefree, &allocs,
3942                                     &frees, &sleeps);
3943                         if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
3944                             (LIST_FIRST(&kz->uk_zones) != z)))
3945                                 cachefree += kz->uk_free;
3946                         for (i = 0; i < vm_ndomains; i++) {
3947                                 zdom = &z->uz_domain[i];
3948                                 LIST_FOREACH(bucket, &zdom->uzd_buckets,
3949                                     ub_link)
3950                                         cachefree += bucket->ub_cnt;
3951                         }
3952                         db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
3953                             z->uz_name, (uintmax_t)kz->uk_size,
3954                             (intmax_t)(allocs - frees), cachefree,
3955                             (uintmax_t)allocs, sleeps, z->uz_count);
3956                         if (db_pager_quit)
3957                                 return;
3958                 }
3959         }
3960 }
3961
3962 DB_SHOW_COMMAND(umacache, db_show_umacache)
3963 {
3964         uma_bucket_t bucket;
3965         uma_zone_t z;
3966         uma_zone_domain_t zdom;
3967         uint64_t allocs, frees;
3968         int cachefree, i;
3969
3970         db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
3971             "Requests", "Bucket");
3972         LIST_FOREACH(z, &uma_cachezones, uz_link) {
3973                 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
3974                 for (i = 0; i < vm_ndomains; i++) {
3975                         zdom = &z->uz_domain[i];
3976                         LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link)
3977                                 cachefree += bucket->ub_cnt;
3978                 }
3979                 db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
3980                     z->uz_name, (uintmax_t)z->uz_size,
3981                     (intmax_t)(allocs - frees), cachefree,
3982                     (uintmax_t)allocs, z->uz_count);
3983                 if (db_pager_quit)
3984                         return;
3985         }
3986 }
3987 #endif  /* DDB */