sys/vm/uma_core.c

   1 /*-
   2  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
   3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   4  * Copyright (c) 2004-2006 Robert N. M. Watson
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice unmodified, this list of conditions, and the following
  12  *    disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * uma_core.c  Implementation of the Universal Memory allocator
  31  *
  32  * This allocator is intended to replace the multitude of similar object caches
  33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
   34  * efficient.  A primary design goal is to return unused memory to the rest of
  35  * the system.  This will make the system as a whole more flexible due to the
  36  * ability to move memory to subsystems which most need it instead of leaving
  37  * pools of reserved memory unused.
  38  *
  39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  40  * are well known.
  41  *
  42  */
  43
  44 /*
  45  * TODO:
  46  *      - Improve memory usage for large allocations
  47  *      - Investigate cache size adjustments
  48  */
  49
  50 #include <sys/cdefs.h>
  51 __FBSDID("$FreeBSD$");
  52
  53 /* I should really use ktr.. */
  54 /*
  55 #define UMA_DEBUG 1
  56 #define UMA_DEBUG_ALLOC 1
  57 #define UMA_DEBUG_ALLOC_1 1
  58 */
  59
  60 #include "opt_ddb.h"
  61 #include "opt_param.h"
  62 #include "opt_vm.h"
  63
  64 #include <sys/param.h>
  65 #include <sys/systm.h>
  66 #include <sys/kernel.h>
  67 #include <sys/types.h>
  68 #include <sys/queue.h>
  69 #include <sys/malloc.h>
  70 #include <sys/ktr.h>
  71 #include <sys/lock.h>
  72 #include <sys/sysctl.h>
  73 #include <sys/mutex.h>
  74 #include <sys/proc.h>
  75 #include <sys/rwlock.h>
  76 #include <sys/sbuf.h>
  77 #include <sys/smp.h>
  78 #include <sys/vmmeter.h>
  79
  80 #include <vm/vm.h>
  81 #include <vm/vm_object.h>
  82 #include <vm/vm_page.h>
  83 #include <vm/vm_pageout.h>
  84 #include <vm/vm_param.h>
  85 #include <vm/vm_map.h>
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/uma.h>
  89 #include <vm/uma_int.h>
  90 #include <vm/uma_dbg.h>
  91
  92 #include <ddb/ddb.h>
  93
  94 #ifdef DEBUG_MEMGUARD
  95 #include <vm/memguard.h>
  96 #endif
  97
   98 /*
   99  * This is the zone and keg from which all zones are spawned.  The idea is that
  100  * even the zone & keg heads are allocated from the allocator, so we use the
  101  * bss section to bootstrap us.
  102  */
  103 static struct uma_keg masterkeg;
  104 static struct uma_zone masterzone_k;
  105 static struct uma_zone masterzone_z;
  106 static uma_zone_t kegs = &masterzone_k;
  107 static uma_zone_t zones = &masterzone_z;
  108
  109 /* This is the zone from which all uma_slab_t's are allocated. */
  110 static uma_zone_t slabzone;
  111 static uma_zone_t slabrefzone;  /* With refcounters (for UMA_ZONE_REFCNT) */
  112
  113 /*
  114  * The initial hash tables come out of this zone so they can be allocated
  115  * prior to malloc coming up.
  116  */
  117 static uma_zone_t hashzone;
  118
  119 /* The boot-time adjusted value for cache line alignment. */
  120 int uma_align_cache = 64 - 1;
  121
  122 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
  123
  124 /*
  125  * Nonzero disables bucket allocation: bucket_alloc() then returns NULL.
  126  */
  127 static int bucketdisable = 1;
  128
  129 /* Linked list of all kegs in the system */
  130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
  131
  132 /* This mutex protects the keg list */
  133 static struct mtx uma_mtx;
  134
  135 /* Linked list of boot time pages */
  136 static LIST_HEAD(,uma_slab) uma_boot_pages =
  137     LIST_HEAD_INITIALIZER(uma_boot_pages);
  138
  139 /* This mutex protects the boot time pages list */
  140 static struct mtx uma_boot_pages_mtx;
  141
  142 /* Is the VM done starting up?  (UMA_STARTUP* are presumably its stages.) */
  143 static int booted = 0;
  144 #define UMA_STARTUP     1
  145 #define UMA_STARTUP2    2
  146
  147 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
  148 static u_int uma_max_ipers;
  149 static u_int uma_max_ipers_ref;
  150
  151 /*
  152  * Callout handle used to schedule uma_timeout(), i.e. periodic work that
  153  * needs to happen outside of the allocation fast path.
  154  */
  155 static struct callout uma_callout;
  156 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 157
  158 /*
  159  * Argument bundle passed as the zone ctor arg, so that no special
  160  * allocation function has to be created just for zones.
  161  */
  162 struct uma_zctor_args {
  163         const char *name;
  164         size_t size;
  165         uma_ctor ctor;
  166         uma_dtor dtor;
  167         uma_init uminit;
  168         uma_fini fini;
  169         uma_keg_t keg;
  170         int align;
  171         u_int32_t flags;
  172 };
  173 /* Keg analogue of uma_zctor_args; fields match uma_kcreate()'s parameters. */
  174 struct uma_kctor_args {
  175         uma_zone_t zone;
  176         size_t size;
  177         uma_init uminit;
  178         uma_fini fini;
  179         int align;
  180         u_int32_t flags;
  181 };
  182 /* One zone of fixed-capacity buckets; bucket_init() creates ubz_zone. */
  183 struct uma_bucket_zone {
  184         uma_zone_t      ubz_zone;       /* Zone the buckets come from */
  185         char            *ubz_name;      /* Name given to uma_zcreate() */
  186         int             ubz_entries;    /* Item pointers per bucket */
  187 };
  188
  189 #define BUCKET_MAX      128     /* Largest ubz_entries in bucket_zones[] */
  190 /* Table of bucket zones; the entry with ubz_entries == 0 terminates it. */
  191 struct uma_bucket_zone bucket_zones[] = {
  192         { NULL, "16 Bucket", 16 },
  193         { NULL, "32 Bucket", 32 },
  194         { NULL, "64 Bucket", 64 },
  195         { NULL, "128 Bucket", 128 },
  196         { NULL, NULL, 0}
  197 };
 198
  199 #define BUCKET_SHIFT    4       /* log2 of the bucket_size[] step (16) */
  200 #define BUCKET_ZONES    ((BUCKET_MAX >> BUCKET_SHIFT) + 1)
  201
  202 /*
  203  * bucket_size[n] is the bucket_zones[] index that serves requests of up to
  204  * (n << BUCKET_SHIFT) entries; bucket_init() fills the table in.
  205  */
  206 static uint8_t bucket_size[BUCKET_ZONES];
 207
  208 /*
  209  * zone_free_item() arguments: enum zfreeskip (4th) and ZFREE_* flags (5th).
  210  */
  211 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
  212
  213 #define ZFREE_STATFAIL  0x00000001      /* Update zone failure statistic. */
  214 #define ZFREE_STATFREE  0x00000002      /* Update zone free statistic. */
 215
 216 /* Prototypes.. */
 217
 218 static void *noobj_alloc(uma_zone_t, int, u_int8_t *, int);
 219 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 220 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
 221 static void page_free(void *, int, u_int8_t);
 222 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 223 static void cache_drain(uma_zone_t);
 224 static void bucket_drain(uma_zone_t, uma_bucket_t);
 225 static void bucket_cache_drain(uma_zone_t zone);
 226 static int keg_ctor(void *, int, void *, int);
 227 static void keg_dtor(void *, int, void *);
 228 static int zone_ctor(void *, int, void *, int);
 229 static void zone_dtor(void *, int, void *);
 230 static int zero_init(void *, int, int);
 231 static void keg_small_init(uma_keg_t keg);
 232 static void keg_large_init(uma_keg_t keg);
 233 static void zone_foreach(void (*zfunc)(uma_zone_t));
 234 static void zone_timeout(uma_zone_t zone);
 235 static int hash_alloc(struct uma_hash *);
 236 static int hash_expand(struct uma_hash *, struct uma_hash *);
 237 static void hash_free(struct uma_hash *hash);
 238 static void uma_timeout(void *);
 239 static void uma_startup3(void);
 240 static void *zone_alloc_item(uma_zone_t, void *, int);
 241 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
 242     int);
 243 static void bucket_enable(void);
 244 static void bucket_init(void);
 245 static uma_bucket_t bucket_alloc(int, int);
 246 static void bucket_free(uma_bucket_t);
 247 static void bucket_zone_drain(void);
 248 static int zone_alloc_bucket(uma_zone_t zone, int flags);
 249 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 250 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
 251 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
 252 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
 253     uma_fini fini, int align, u_int32_t flags);
 254 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
 255 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
 256
 257 void uma_print_zone(uma_zone_t);
 258 void uma_print_stats(void);
 259 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 260 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 261
  262 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
  263 /* Read-only sysctls vm.zone_count and vm.zone_stats. */
  264 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
  265     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
  266
  267 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
  268     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
  269 /* Nonzero lets zone_log_warning() print; set via tunable or sysctl. */
  270 static int zone_warnings = 1;
  271 TUNABLE_INT("vm.zone_warnings", &zone_warnings);
  272 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
  273     "Warn when UMA zones becomes full");
 274
  275 /*
  276  * Set bucketdisable from vm_page_count_min(); nonzero blocks bucket_alloc().
  277  */
  278
  279 static void
  280 bucket_enable(void)
  281 {
  282         bucketdisable = vm_page_count_min();
  283 }
 284
 285 /*
 286  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 287  *
 288  * For each zone, calculate the memory required for each bucket, consisting
 289  * of the header and an array of pointers.  Initialize bucket_size[] to point
 290  * the range of appropriate bucket sizes at the zone.
 291  */
 292 static void
 293 bucket_init(void)
 294 {
 295         struct uma_bucket_zone *ubz;
 296         int i;
 297         int j;
 298
 299         for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
 300                 int size;
 301
 302                 ubz = &bucket_zones[j];
 303                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 304                 size += sizeof(void *) * ubz->ubz_entries;
 305                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 306                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 307                     UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 308                 for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
 309                         bucket_size[i >> BUCKET_SHIFT] = j;
 310         }
 311 }
 312
 313 /*
 314  * Given a desired number of entries for a bucket, return the zone from which
 315  * to allocate the bucket.
 316  */
 317 static struct uma_bucket_zone *
 318 bucket_zone_lookup(int entries)
 319 {
 320         int idx;
 321
 322         idx = howmany(entries, 1 << BUCKET_SHIFT);
 323         return (&bucket_zones[bucket_size[idx]]);
 324 }
 325
 326 static uma_bucket_t
 327 bucket_alloc(int entries, int bflags)
 328 {
 329         struct uma_bucket_zone *ubz;
 330         uma_bucket_t bucket;
 331
 332         /*
 333          * This is to stop us from allocating per cpu buckets while we're
 334          * running out of vm.boot_pages.  Otherwise, we would exhaust the
 335          * boot pages.  This also prevents us from allocating buckets in
 336          * low memory situations.
 337          */
 338         if (bucketdisable)
 339                 return (NULL);
 340
 341         ubz = bucket_zone_lookup(entries);
 342         bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
 343         if (bucket) {
 344 #ifdef INVARIANTS
 345                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 346 #endif
 347                 bucket->ub_cnt = 0;
 348                 bucket->ub_entries = ubz->ubz_entries;
 349         }
 350
 351         return (bucket);
 352 }
 353
 354 static void
 355 bucket_free(uma_bucket_t bucket)
 356 {
 357         struct uma_bucket_zone *ubz;
 358
 359         ubz = bucket_zone_lookup(bucket->ub_entries);
 360         zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
 361             ZFREE_STATFREE);
 362 }
 363
 364 static void
 365 bucket_zone_drain(void)
 366 {
 367         struct uma_bucket_zone *ubz;
 368
 369         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 370                 zone_drain(ubz->ubz_zone);
 371 }
 372
 373 static void
 374 zone_log_warning(uma_zone_t zone)
 375 {
 376         static const struct timeval warninterval = { 300, 0 };
 377
 378         if (!zone_warnings || zone->uz_warning == NULL)
 379                 return;
 380
 381         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 382                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 383 }
 384
 385 static inline uma_keg_t
 386 zone_first_keg(uma_zone_t zone)
 387 {
 388
 389         return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
 390 }
 391
 392 static void
 393 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 394 {
 395         uma_klink_t klink;
 396
 397         LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 398                 kegfn(klink->kl_keg);
 399 }
 400
  401 /*
  402  * Callout handler for periodic housekeeping: it calls bucket_enable(),
  403  * passes zone_timeout() to zone_foreach(), and then re-arms itself.
  404  *
  405  * Arguments:
  406  *      unused  Callout argument; ignored
  407  *
  408  * Returns:
  409  *      Nothing
  410  */
  411 static void
  412 uma_timeout(void *unused)
  413 {
  414         bucket_enable();
  415         zone_foreach(zone_timeout);
  416
  417         /* Re-arm the callout to fire again in UMA_TIMEOUT seconds. */
  418         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
  419 }
 420
  421 /*
  422  * Periodic per-keg work, run from zone_timeout(): grow the slab hash table
  423  * of a UMA_ZONE_HASH keg once its slab count reaches the table size.
  424  *
  425  *  Returns nothing.
  426  */
  427 static void
  428 keg_timeout(uma_keg_t keg)
  429 {
  430
  431         KEG_LOCK(keg);
  432         /*
  433          * Expand the keg hash table.
  434          *
  435          * This is done when the number of slabs (uk_pages / uk_ppera) is at
  436          * least the hash size.  The aim is to keep collisions to a minimum;
  437          * this may be a little aggressive (allow two collisions max?).
  438          */
  439         if (keg->uk_flags & UMA_ZONE_HASH &&
  440             keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
  441                 struct uma_hash newhash;
  442                 struct uma_hash oldhash;
  443                 int ret;
  444
  445                 /*
  446                  * This is so involved because allocating and freeing
  447                  * while the keg lock is held will lead to deadlock.
  448                  * Everything is done in stages, with the lock dropped
  449                  * around hash_alloc() and hash_free(), checking for races.
  450                  */
  451                 newhash = keg->uk_hash;         /* copy carries the old size */
  452                 KEG_UNLOCK(keg);
  453                 ret = hash_alloc(&newhash);
  454                 KEG_LOCK(keg);
  455                 if (ret) {
  456                         if (hash_expand(&keg->uk_hash, &newhash)) {
  457                                 oldhash = keg->uk_hash; /* retire old table */
  458                                 keg->uk_hash = newhash;
  459                         } else
  460                                 oldhash = newhash;      /* discard new table */
  461
  462                         KEG_UNLOCK(keg);
  463                         hash_free(&oldhash);
  464                         KEG_LOCK(keg);
  465                 }
  466         }
  467         KEG_UNLOCK(keg);
  468 }
 469
  470 static void
  471 zone_timeout(uma_zone_t zone)
  472 {
  473         /* Run keg_timeout() on each keg linked to this zone. */
  474         zone_foreach_keg(zone, &keg_timeout);
  475 }
 476
 477 /*
 478  * Allocate and zero fill the next sized hash table from the appropriate
 479  * backing store.
 480  *
 481  * Arguments:
 482  *      hash  A new hash structure with the old hash size in uh_hashsize
 483  *
 484  * Returns:
 485  *      1 on sucess and 0 on failure.
 486  */
 487 static int
 488 hash_alloc(struct uma_hash *hash)
 489 {
 490         int oldsize;
 491         int alloc;
 492
 493         oldsize = hash->uh_hashsize;
 494
 495         /* We're just going to go to a power of two greater */
 496         if (oldsize)  {
 497                 hash->uh_hashsize = oldsize * 2;
 498                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 499                 hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 500                     M_UMAHASH, M_NOWAIT);
 501         } else {
 502                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 503                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 504                     M_WAITOK);
 505                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 506         }
 507         if (hash->uh_slab_hash) {
 508                 bzero(hash->uh_slab_hash, alloc);
 509                 hash->uh_hashmask = hash->uh_hashsize - 1;
 510                 return (1);
 511         }
 512
 513         return (0);
 514 }
 515
 516 /*
 517  * Expands the hash table for HASH zones.  This is done from zone_timeout
 518  * to reduce collisions.  This must not be done in the regular allocation
 519  * path, otherwise, we can recurse on the vm while allocating pages.
 520  *
 521  * Arguments:
 522  *      oldhash  The hash you want to expand
 523  *      newhash  The hash structure for the new table
 524  *
 525  * Returns:
 526  *      Nothing
 527  *
 528  * Discussion:
 529  */
 530 static int
 531 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 532 {
 533         uma_slab_t slab;
 534         int hval;
 535         int i;
 536
 537         if (!newhash->uh_slab_hash)
 538                 return (0);
 539
 540         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 541                 return (0);
 542
 543         /*
 544          * I need to investigate hash algorithms for resizing without a
 545          * full rehash.
 546          */
 547
 548         for (i = 0; i < oldhash->uh_hashsize; i++)
 549                 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 550                         slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 551                         SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 552                         hval = UMA_HASH(newhash, slab->us_data);
 553                         SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 554                             slab, us_hlink);
 555                 }
 556
 557         return (1);
 558 }
 559
 560 /*
 561  * Free the hash bucket to the appropriate backing store.
 562  *
 563  * Arguments:
 564  *      slab_hash  The hash bucket we're freeing
 565  *      hashsize   The number of entries in that hash bucket
 566  *
 567  * Returns:
 568  *      Nothing
 569  */
 570 static void
 571 hash_free(struct uma_hash *hash)
 572 {
 573         if (hash->uh_slab_hash == NULL)
 574                 return;
 575         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 576                 zone_free_item(hashzone,
 577                     hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
 578         else
 579                 free(hash->uh_slab_hash, M_UMAHASH);
 580 }
 581
 582 /*
 583  * Frees all outstanding items in a bucket
 584  *
 585  * Arguments:
 586  *      zone   The zone to free to, must be unlocked.
 587  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 588  *
 589  * Returns:
 590  *      Nothing
 591  */
 592
 593 static void
 594 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 595 {
 596         void *item;
 597
 598         if (bucket == NULL)
 599                 return;
 600
 601         while (bucket->ub_cnt > 0)  {
 602                 bucket->ub_cnt--;
 603                 item = bucket->ub_bucket[bucket->ub_cnt];
 604 #ifdef INVARIANTS
 605                 bucket->ub_bucket[bucket->ub_cnt] = NULL;
 606                 KASSERT(item != NULL,
 607                     ("bucket_drain: botched ptr, item is NULL"));
 608 #endif
 609                 zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
 610         }
 611 }
 612
 613 /*
 614  * Drains the per cpu caches for a zone.
 615  *
 616  * NOTE: This may only be called while the zone is being turn down, and not
 617  * during normal operation.  This is necessary in order that we do not have
 618  * to migrate CPUs to drain the per-CPU caches.
 619  *
 620  * Arguments:
 621  *      zone     The zone to drain, must be unlocked.
 622  *
 623  * Returns:
 624  *      Nothing
 625  */
 626 static void
 627 cache_drain(uma_zone_t zone)
 628 {
 629         uma_cache_t cache;
 630         int cpu;
 631
 632         /*
 633          * XXX: It is safe to not lock the per-CPU caches, because we're
 634          * tearing down the zone anyway.  I.e., there will be no further use
 635          * of the caches at this point.
 636          *
 637          * XXX: It would good to be able to assert that the zone is being
 638          * torn down to prevent improper use of cache_drain().
 639          *
 640          * XXX: We lock the zone before passing into bucket_cache_drain() as
 641          * it is used elsewhere.  Should the tear-down path be made special
 642          * there in some form?
 643          */
 644         CPU_FOREACH(cpu) {
 645                 cache = &zone->uz_cpu[cpu];
 646                 bucket_drain(zone, cache->uc_allocbucket);
 647                 bucket_drain(zone, cache->uc_freebucket);
 648                 if (cache->uc_allocbucket != NULL)
 649                         bucket_free(cache->uc_allocbucket);
 650                 if (cache->uc_freebucket != NULL)
 651                         bucket_free(cache->uc_freebucket);
 652                 cache->uc_allocbucket = cache->uc_freebucket = NULL;
 653         }
 654         ZONE_LOCK(zone);
 655         bucket_cache_drain(zone);
 656         ZONE_UNLOCK(zone);
 657 }
 658
  659 /*
  660  * Drain the cached buckets from a zone.  The zone is locked on entry and exit.
  661  */
  662 static void
  663 bucket_cache_drain(uma_zone_t zone)
  664 {
  665         uma_bucket_t bucket;
  666
  667         /*
  668          * Drain the bucket queues and free the buckets, we just keep two per
  669          * cpu (alloc/free).  The zone lock is dropped around each full bucket.
  670          */
  671         while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
  672                 LIST_REMOVE(bucket, ub_link);
  673                 ZONE_UNLOCK(zone);      /* bucket_drain() wants it unlocked */
  674                 bucket_drain(zone, bucket);
  675                 bucket_free(bucket);
  676                 ZONE_LOCK(zone);
  677         }
  678
  679         /* Buckets on the free queue are freed undrained, lock held. */
  680         while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
  681                 LIST_REMOVE(bucket, ub_link);
  682                 bucket_free(bucket);
  683         }
  684 }
 685
  686 /*
  687  * Frees pages from a keg back to the system.  This is done on demand from
  688  * the pageout daemon.  Slabs on uk_free_slab are unlinked with the keg
  689  * locked, then finalized and released through uk_freef without the lock.
  690  * UMA_SLAB_BOOT slabs are left alone.  Returns nothing.
  691  */
  692 static void
  693 keg_drain(uma_keg_t keg)
  694 {
  695         struct slabhead freeslabs = { 0 };
  696         uma_slab_t slab;
  697         uma_slab_t n;
  698         u_int8_t flags;
  699         u_int8_t *mem;
  700         int i;
  701
  702         /*
  703          * We don't want to take pages from statically allocated kegs at this
  704          * time; a keg without a free routine cannot give pages back either.
  705          */
  706         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
  707                 return;
  708
  709 #ifdef UMA_DEBUG
  710         printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
  711 #endif
  712         KEG_LOCK(keg);
  713         if (keg->uk_free == 0)
  714                 goto finished;
  715
  716         slab = LIST_FIRST(&keg->uk_free_slab);
  717         while (slab) {
  718                 n = LIST_NEXT(slab, us_link);   /* saved before unlinking */
  719
  720                 /* Boot-time slabs have nowhere to be freed to; skip them. */
  721                 if (slab->us_flags & UMA_SLAB_BOOT) {
  722                         slab = n;
  723                         continue;
  724                 }
  725
  726                 LIST_REMOVE(slab, us_link);
  727                 keg->uk_pages -= keg->uk_ppera;
  728                 keg->uk_free -= keg->uk_ipers;
  729
  730                 if (keg->uk_flags & UMA_ZONE_HASH)
  731                         UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
  732
  733                 SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
  734
  735                 slab = n;
  736         }
  737 finished:
  738         KEG_UNLOCK(keg);
  739         /* Lock dropped: finalize and release the collected slabs. */
  740         while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
  741                 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
  742                 if (keg->uk_fini)
  743                         for (i = 0; i < keg->uk_ipers; i++)
  744                                 keg->uk_fini(
  745                                     slab->us_data + (keg->uk_rsize * i),
  746                                     keg->uk_size);
  747                 flags = slab->us_flags; /* copied: the header is freed below */
  748                 mem = slab->us_data;
  749                 /* Presumably undoes keg_alloc_slab()'s vsetslab() -- confirm. */
  750                 if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
  751                         vm_object_t obj;
  752
  753                         if (flags & UMA_SLAB_KMEM)
  754                                 obj = kmem_object;
  755                         else if (flags & UMA_SLAB_KERNEL)
  756                                 obj = kernel_object;
  757                         else
  758                                 obj = NULL;
  759                         for (i = 0; i < keg->uk_ppera; i++)
  760                                 vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
  761                                     obj);
  762                 }
  763                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
  764                         zone_free_item(keg->uk_slabzone, slab, NULL,
  765                             SKIP_NONE, ZFREE_STATFREE);
  766 #ifdef UMA_DEBUG
  767                 printf("%s: Returning %d bytes.\n",
  768                     keg->uk_name, PAGE_SIZE * keg->uk_ppera);
  769 #endif
  770                 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
  771         }
  772 }
  773 /* Drain a zone's cached buckets, then each of its kegs via keg_drain(). */
  774 static void
  775 zone_drain_wait(uma_zone_t zone, int waitok)
  776 {
  777
  778         /*
  779          * Set draining to interlock with zone_dtor() so we can release our
  780          * locks as we go.  Only dtor() should do a WAITOK call since it
  781          * is the only call that knows the structure will still be available
  782          * when it wakes up.
  783          */
  784         ZONE_LOCK(zone);
  785         while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
  786                 if (waitok == M_NOWAIT)
  787                         goto out;       /* a drain is already running */
  788                 mtx_unlock(&uma_mtx);   /* presumably held by WAITOK callers */
  789                 msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
  790                 mtx_lock(&uma_mtx);
  791         }
  792         zone->uz_flags |= UMA_ZFLAG_DRAINING;
  793         bucket_cache_drain(zone);
  794         ZONE_UNLOCK(zone);
  795         /*
  796          * The DRAINING flag protects us from being freed while
  797          * we're running.  Normally the uma_mtx would protect us but we
  798          * must be able to release and acquire the right lock for each keg.
  799          */
  800         zone_foreach_keg(zone, &keg_drain);
  801         ZONE_LOCK(zone);
  802         zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
  803         wakeup(zone);           /* release sleepers in the loop above */
  804 out:
  805         ZONE_UNLOCK(zone);
  806 }
 807
  808 void
  809 zone_drain(uma_zone_t zone)
  810 {
  811         /* M_NOWAIT: do nothing if the zone is already being drained. */
  812         zone_drain_wait(zone, M_NOWAIT);
  813 }
 814
  815 /*
  816  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
  817  * The keg lock is held on entry and return but is dropped in between.
  818  *
  819  * Arguments:
  820  *      keg, zone  The keg to grow; zone is only handed to uk_allocf
  821  *      wait       Allocation flags; M_ZERO/M_NODUMP are adjusted per uk_flags
  822  * Returns:
  823  *      The slab, or NULL if an allocation or a uk_init call failed.
  824  */
  825 static uma_slab_t
  826 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
  827 {
  828         uma_slabrefcnt_t slabref;
  829         uma_alloc allocf;
  830         uma_slab_t slab;
  831         u_int8_t *mem;
  832         u_int8_t flags;
  833         int i;
  834
  835         mtx_assert(&keg->uk_lock, MA_OWNED);
  836         slab = NULL;
  837
  838 #ifdef UMA_DEBUG
  839         printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
  840 #endif
  841         allocf = keg->uk_allocf;        /* read while the keg is still locked */
  842         KEG_UNLOCK(keg);
  843         /* Off-page slab headers come from the keg's slab zone. */
  844         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
  845                 slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
  846                 if (slab == NULL) {
  847                         KEG_LOCK(keg);
  848                         return NULL;
  849                 }
  850         }
  851
  852         /*
  853          * This reproduces the old vm_zone behavior of zero filling pages the
  854          * first time they are added to a zone.
  855          *
  856          * Malloced items are zeroed in uma_zalloc.
  857          */
  858
  859         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
  860                 wait |= M_ZERO;
  861         else
  862                 wait &= ~M_ZERO;
  863
  864         if (keg->uk_flags & UMA_ZONE_NODUMP)
  865                 wait |= M_NODUMP;
  866
  867         /* zone is passed for legacy reasons. */
  868         mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
  869         if (mem == NULL) {
  870                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
  871                         zone_free_item(keg->uk_slabzone, slab, NULL,
  872                             SKIP_NONE, ZFREE_STATFREE);
  873                 KEG_LOCK(keg);
  874                 return (NULL);
  875         }
  876
  877         /* In-page header: the slab lives uk_pgoff bytes into the memory. */
  878         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
  879                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
  880
  881         if (keg->uk_flags & UMA_ZONE_VTOSLAB)
  882                 for (i = 0; i < keg->uk_ppera; i++)
  883                         vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
  884         /* Fill in the slab header; all uk_ipers items start out free. */
  885         slab->us_keg = keg;
  886         slab->us_data = mem;
  887         slab->us_freecount = keg->uk_ipers;
  888         slab->us_firstfree = 0;
  889         slab->us_flags = flags;
  890         /* Build the free list: entry i links to item i + 1. */
  891         if (keg->uk_flags & UMA_ZONE_REFCNT) {
  892                 slabref = (uma_slabrefcnt_t)slab;
  893                 for (i = 0; i < keg->uk_ipers; i++) {
  894                         slabref->us_freelist[i].us_refcnt = 0;
  895                         slabref->us_freelist[i].us_item = i+1;
  896                 }
  897         } else {
  898                 for (i = 0; i < keg->uk_ipers; i++)
  899                         slab->us_freelist[i].us_item = i+1;
  900         }
  901         /* Run uk_init on every item; on failure undo and free the slab. */
  902         if (keg->uk_init != NULL) {
  903                 for (i = 0; i < keg->uk_ipers; i++)
  904                         if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
  905                             keg->uk_size, wait) != 0)
  906                                 break;
  907                 if (i != keg->uk_ipers) {
  908                         if (keg->uk_fini != NULL) {
  909                                 for (i--; i > -1; i--)  /* initialized items only */
  910                                         keg->uk_fini(slab->us_data +
  911                                             (keg->uk_rsize * i),
  912                                             keg->uk_size);
  913                         }
  914                         if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
  915                                 vm_object_t obj;
  916
  917                                 if (flags & UMA_SLAB_KMEM)
  918                                         obj = kmem_object;
  919                                 else if (flags & UMA_SLAB_KERNEL)
  920                                         obj = kernel_object;
  921                                 else
  922                                         obj = NULL;
  923                                 for (i = 0; i < keg->uk_ppera; i++)
  924                                         vsetobj((vm_offset_t)mem +
  925                                             (i * PAGE_SIZE), obj);
  926                         }
  927                         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
  928                                 zone_free_item(keg->uk_slabzone, slab,
  929                                     NULL, SKIP_NONE, ZFREE_STATFREE);
  930                         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera,
  931                             flags);
  932                         KEG_LOCK(keg);
  933                         return (NULL);
  934                 }
  935         }
  936         KEG_LOCK(keg);
  937         /* Keg locked again: enter the slab in the hash and the counters. */
  938         if (keg->uk_flags & UMA_ZONE_HASH)
  939                 UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
  940
  941         keg->uk_pages += keg->uk_ppera;
  942         keg->uk_free += keg->uk_ipers;
  943
  944         return (slab);
  945 }
 946
 947 /*
 948  * This function is intended to be used early on in place of page_alloc() so
 949  * that we may use the boot time page cache to satisfy allocations before
 950  * the VM is ready.
 951  */
 952 static void *
 953 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 954 {
 955         uma_keg_t keg;
 956         uma_slab_t tmps;
 957         int pages, check_pages;
 958
 959         keg = zone_first_keg(zone);
 960         pages = howmany(bytes, PAGE_SIZE);
 961         check_pages = pages - 1;
 962         KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
 963
 964         /*
 965          * Check our small startup cache to see if it has pages remaining.
 966          */
 967         mtx_lock(&uma_boot_pages_mtx);
 968
 969         /* First check if we have enough room. */
 970         tmps = LIST_FIRST(&uma_boot_pages);
 971         while (tmps != NULL && check_pages-- > 0)
 972                 tmps = LIST_NEXT(tmps, us_link);
 973         if (tmps != NULL) {
 974                 /*
 975                  * It's ok to lose tmps references.  The last one will
 976                  * have tmps->us_data pointing to the start address of
 977                  * "pages" contiguous pages of memory.
 978                  */
 979                 while (pages-- > 0) {
 980                         tmps = LIST_FIRST(&uma_boot_pages);
 981                         LIST_REMOVE(tmps, us_link);
 982                 }
 983                 mtx_unlock(&uma_boot_pages_mtx);
 984                 *pflag = tmps->us_flags;
 985                 return (tmps->us_data);
 986         }
 987         mtx_unlock(&uma_boot_pages_mtx);
 988         if (booted < UMA_STARTUP2)
 989                 panic("UMA: Increase vm.boot_pages");
 990         /*
 991          * Now that we've booted reset these users to their real allocator.
 992          */
 993 #ifdef UMA_MD_SMALL_ALLOC
 994         keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
 995 #else
 996         keg->uk_allocf = page_alloc;
 997 #endif
 998         return keg->uk_allocf(zone, bytes, pflag, wait);
 999 }
1000
1001 /*
1002  * Allocates a number of pages from the system
1003  *
1004  * Arguments:
1005  *      bytes  The number of bytes requested
1006  *      wait  Shall we wait?
1007  *
1008  * Returns:
1009  *      A pointer to the alloced memory or possibly
1010  *      NULL if M_NOWAIT is set.
1011  */
1012 static void *
1013 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
1014 {
1015         void *p;        /* Returned page */
1016
1017         *pflag = UMA_SLAB_KMEM;
1018         p = (void *) kmem_malloc(kmem_map, bytes, wait);
1019
1020         return (p);
1021 }
1022
1023 /*
1024  * Allocates a number of pages from within an object
1025  *
1026  * Arguments:
1027  *      bytes  The number of bytes requested
1028  *      wait   Shall we wait?
1029  *
1030  * Returns:
1031  *      A pointer to the alloced memory or possibly
1032  *      NULL if M_NOWAIT is set.
1033  */
1034 static void *
1035 noobj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
1036 {
1037         TAILQ_HEAD(, vm_page) alloctail;
1038         u_long npages;
1039         vm_offset_t retkva, zkva;
1040         vm_page_t p, p_next;
1041         uma_keg_t keg;
1042
1043         TAILQ_INIT(&alloctail);
1044         keg = zone_first_keg(zone);
1045
1046         npages = howmany(bytes, PAGE_SIZE);
1047         while (npages > 0) {
1048                 p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1049                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1050                 if (p != NULL) {
1051                         /*
1052                          * Since the page does not belong to an object, its
1053                          * listq is unused.
1054                          */
1055                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1056                         npages--;
1057                         continue;
1058                 }
1059                 if (wait & M_WAITOK) {
1060                         VM_WAIT;
1061                         continue;
1062                 }
1063
1064                 /*
1065                  * Page allocation failed, free intermediate pages and
1066                  * exit.
1067                  */
1068                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1069                         vm_page_unwire(p, 0);
1070                         vm_page_free(p);
1071                 }
1072                 return (NULL);
1073         }
1074         *flags = UMA_SLAB_PRIV;
1075         zkva = keg->uk_kva +
1076             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1077         retkva = zkva;
1078         TAILQ_FOREACH(p, &alloctail, listq) {
1079                 pmap_qenter(zkva, &p, 1);
1080                 zkva += PAGE_SIZE;
1081         }
1082
1083         return ((void *)retkva);
1084 }
1085
1086 /*
1087  * Frees a number of pages to the system
1088  *
1089  * Arguments:
1090  *      mem   A pointer to the memory to be freed
1091  *      size  The size of the memory being freed
1092  *      flags The original p->us_flags field
1093  *
1094  * Returns:
1095  *      Nothing
1096  */
1097 static void
1098 page_free(void *mem, int size, u_int8_t flags)
1099 {
1100         vm_map_t map;
1101
1102         if (flags & UMA_SLAB_KMEM)
1103                 map = kmem_map;
1104         else if (flags & UMA_SLAB_KERNEL)
1105                 map = kernel_map;
1106         else
1107                 panic("UMA: page_free used with invalid flags %d", flags);
1108
1109         kmem_free(map, (vm_offset_t)mem, size);
1110 }
1111
/*
 * Item initializer that clears the first 'size' bytes of 'mem'.
 *
 * Arguments/Returns follow uma_init specifications; this initializer
 * always returns 0.
 */
static int
zero_init(void *mem, int size, int flags)
{

        (void)flags;    /* Part of the uma_init signature; not consulted. */

        bzero(mem, size);
        return (0);
}
1123
1124 /*
1125  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1126  *
1127  * Arguments
1128  *      keg  The zone we should initialize
1129  *
1130  * Returns
1131  *      Nothing
1132  */
1133 static void
1134 keg_small_init(uma_keg_t keg)
1135 {
1136         u_int rsize;
1137         u_int memused;
1138         u_int wastedspace;
1139         u_int shsize;
1140
1141         if (keg->uk_flags & UMA_ZONE_PCPU) {
1142                 keg->uk_slabsize = sizeof(struct pcpu);
1143                 keg->uk_ppera = howmany(mp_ncpus * sizeof(struct pcpu),
1144                     PAGE_SIZE);
1145         } else {
1146                 keg->uk_slabsize = UMA_SLAB_SIZE;
1147                 keg->uk_ppera = 1;
1148         }
1149
1150         rsize = keg->uk_size;
1151
1152         if (rsize & keg->uk_align)
1153                 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1154         if (rsize < keg->uk_slabsize / 256)
1155                 rsize = keg->uk_slabsize / 256;
1156
1157         keg->uk_rsize = rsize;
1158
1159         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1160             keg->uk_rsize < sizeof(struct pcpu),
1161             ("%s: size %u too large", __func__, keg->uk_rsize));
1162
1163         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1164                 shsize = 0;
1165         } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1166                 rsize += UMA_FRITMREF_SZ;       /* linkage & refcnt */
1167                 shsize = sizeof(struct uma_slab_refcnt);
1168         } else {
1169                 rsize += UMA_FRITM_SZ;  /* Account for linkage */
1170                 shsize = sizeof(struct uma_slab);
1171         }
1172
1173         keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize;
1174         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= 255,
1175             ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1176
1177         memused = keg->uk_ipers * rsize + shsize;
1178         wastedspace = keg->uk_slabsize - memused;
1179
1180         /*
1181          * We can't do OFFPAGE if we're internal or if we've been
1182          * asked to not go to the VM for buckets.  If we do this we
1183          * may end up going to the VM (kmem_map) for slabs which we
1184          * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
1185          * result of UMA_ZONE_VM, which clearly forbids it.
1186          */
1187         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1188             (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1189                 return;
1190
1191         if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) &&
1192             (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) {
1193                 keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize;
1194                 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= 255,
1195                     ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1196 #ifdef UMA_DEBUG
1197                 printf("UMA decided we need offpage slab headers for "
1198                     "keg: %s, calculated wastedspace = %d, "
1199                     "maximum wasted space allowed = %d, "
1200                     "calculated ipers = %d, "
1201                     "new wasted space = %d\n", keg->uk_name, wastedspace,
1202                     keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1203                     keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize);
1204 #endif
1205                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1206         }
1207
1208         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1209             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1210                 keg->uk_flags |= UMA_ZONE_HASH;
1211 }
1212
1213 /*
1214  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1215  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1216  * more complicated.
1217  *
1218  * Arguments
1219  *      keg  The keg we should initialize
1220  *
1221  * Returns
1222  *      Nothing
1223  */
1224 static void
1225 keg_large_init(uma_keg_t keg)
1226 {
1227
1228         KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1229         KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1230             ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1231         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1232             ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1233
1234         keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1235         keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE;
1236         keg->uk_ipers = 1;
1237         keg->uk_rsize = keg->uk_size;
1238
1239         /* We can't do OFFPAGE if we're internal, bail out here. */
1240         if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1241                 return;
1242
1243         keg->uk_flags |= UMA_ZONE_OFFPAGE;
1244         if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1245                 keg->uk_flags |= UMA_ZONE_HASH;
1246 }
1247
1248 static void
1249 keg_cachespread_init(uma_keg_t keg)
1250 {
1251         int alignsize;
1252         int trailer;
1253         int pages;
1254         int rsize;
1255
1256         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1257             ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1258
1259         alignsize = keg->uk_align + 1;
1260         rsize = keg->uk_size;
1261         /*
1262          * We want one item to start on every align boundary in a page.  To
1263          * do this we will span pages.  We will also extend the item by the
1264          * size of align if it is an even multiple of align.  Otherwise, it
1265          * would fall on the same boundary every time.
1266          */
1267         if (rsize & keg->uk_align)
1268                 rsize = (rsize & ~keg->uk_align) + alignsize;
1269         if ((rsize & alignsize) == 0)
1270                 rsize += alignsize;
1271         trailer = rsize - keg->uk_size;
1272         pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1273         pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1274         keg->uk_rsize = rsize;
1275         keg->uk_ppera = pages;
1276         keg->uk_slabsize = UMA_SLAB_SIZE;
1277         keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1278         keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1279         KASSERT(keg->uk_ipers <= uma_max_ipers,
1280             ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1281             keg->uk_ipers));
1282 }
1283
1284 /*
1285  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1286  * the keg onto the global keg list.
1287  *
1288  * Arguments/Returns follow uma_ctor specifications
1289  *      udata  Actually uma_kctor_args
1290  */
1291 static int
1292 keg_ctor(void *mem, int size, void *udata, int flags)
1293 {
1294         struct uma_kctor_args *arg = udata;
1295         uma_keg_t keg = mem;
1296         uma_zone_t zone;
1297
1298         bzero(keg, size);
1299         keg->uk_size = arg->size;
1300         keg->uk_init = arg->uminit;
1301         keg->uk_fini = arg->fini;
1302         keg->uk_align = arg->align;
1303         keg->uk_free = 0;
1304         keg->uk_pages = 0;
1305         keg->uk_flags = arg->flags;
1306         keg->uk_allocf = page_alloc;
1307         keg->uk_freef = page_free;
1308         keg->uk_recurse = 0;
1309         keg->uk_slabzone = NULL;
1310
1311         /*
1312          * The master zone is passed to us at keg-creation time.
1313          */
1314         zone = arg->zone;
1315         keg->uk_name = zone->uz_name;
1316
1317         if (arg->flags & UMA_ZONE_VM)
1318                 keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1319
1320         if (arg->flags & UMA_ZONE_ZINIT)
1321                 keg->uk_init = zero_init;
1322
1323         if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1324                 keg->uk_flags |= UMA_ZONE_VTOSLAB;
1325
1326         if (arg->flags & UMA_ZONE_PCPU)
1327 #ifdef SMP
1328                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1329 #else
1330                 keg->uk_flags &= ~UMA_ZONE_PCPU;
1331 #endif
1332
1333         /*
1334          * The +UMA_FRITM_SZ added to uk_size is to account for the
1335          * linkage that is added to the size in keg_small_init().  If
1336          * we don't account for this here then we may end up in
1337          * keg_small_init() with a calculated 'ipers' of 0.
1338          */
1339         if (keg->uk_flags & UMA_ZONE_REFCNT) {
1340                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1341                         keg_cachespread_init(keg);
1342                 else if ((keg->uk_size+UMA_FRITMREF_SZ) >
1343                     (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
1344                         keg_large_init(keg);
1345                 else
1346                         keg_small_init(keg);
1347         } else {
1348                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1349                         keg_cachespread_init(keg);
1350                 else if ((keg->uk_size+UMA_FRITM_SZ) >
1351                     (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1352                         keg_large_init(keg);
1353                 else
1354                         keg_small_init(keg);
1355         }
1356
1357         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1358                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1359                         keg->uk_slabzone = slabrefzone;
1360                 else
1361                         keg->uk_slabzone = slabzone;
1362         }
1363
1364         /*
1365          * If we haven't booted yet we need allocations to go through the
1366          * startup cache until the vm is ready.
1367          */
1368         if (keg->uk_ppera == 1) {
1369 #ifdef UMA_MD_SMALL_ALLOC
1370                 keg->uk_allocf = uma_small_alloc;
1371                 keg->uk_freef = uma_small_free;
1372
1373                 if (booted < UMA_STARTUP)
1374                         keg->uk_allocf = startup_alloc;
1375 #else
1376                 if (booted < UMA_STARTUP2)
1377                         keg->uk_allocf = startup_alloc;
1378 #endif
1379         } else if (booted < UMA_STARTUP2 &&
1380             (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1381                 keg->uk_allocf = startup_alloc;
1382
1383         /*
1384          * Initialize keg's lock (shared among zones).
1385          */
1386         if (arg->flags & UMA_ZONE_MTXCLASS)
1387                 KEG_LOCK_INIT(keg, 1);
1388         else
1389                 KEG_LOCK_INIT(keg, 0);
1390
1391         /*
1392          * If we're putting the slab header in the actual page we need to
1393          * figure out where in each page it goes.  This calculates a right
1394          * justified offset into the memory on an ALIGN_PTR boundary.
1395          */
1396         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1397                 u_int totsize;
1398
1399                 /* Size of the slab struct and free list */
1400                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1401                         totsize = sizeof(struct uma_slab_refcnt) +
1402                             keg->uk_ipers * UMA_FRITMREF_SZ;
1403                 else
1404                         totsize = sizeof(struct uma_slab) +
1405                             keg->uk_ipers * UMA_FRITM_SZ;
1406
1407                 if (totsize & UMA_ALIGN_PTR)
1408                         totsize = (totsize & ~UMA_ALIGN_PTR) +
1409                             (UMA_ALIGN_PTR + 1);
1410                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1411
1412                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1413                         totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
1414                             + keg->uk_ipers * UMA_FRITMREF_SZ;
1415                 else
1416                         totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1417                             + keg->uk_ipers * UMA_FRITM_SZ;
1418
1419                 /*
1420                  * The only way the following is possible is if with our
1421                  * UMA_ALIGN_PTR adjustments we are now bigger than
1422                  * UMA_SLAB_SIZE.  I haven't checked whether this is
1423                  * mathematically possible for all cases, so we make
1424                  * sure here anyway.
1425                  */
1426                 if (totsize > PAGE_SIZE * keg->uk_ppera) {
1427                         printf("zone %s ipers %d rsize %d size %d\n",
1428                             zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1429                             keg->uk_size);
1430                         panic("UMA slab won't fit.");
1431                 }
1432         }
1433
1434         if (keg->uk_flags & UMA_ZONE_HASH)
1435                 hash_alloc(&keg->uk_hash);
1436
1437 #ifdef UMA_DEBUG
1438         printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1439             zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1440             keg->uk_ipers, keg->uk_ppera,
1441             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1442 #endif
1443
1444         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1445
1446         mtx_lock(&uma_mtx);
1447         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1448         mtx_unlock(&uma_mtx);
1449         return (0);
1450 }
1451
1452 /*
1453  * Zone header ctor.  This initializes all fields, locks, etc.
1454  *
1455  * Arguments/Returns follow uma_ctor specifications
1456  *      udata  Actually uma_zctor_args
1457  */
1458 static int
1459 zone_ctor(void *mem, int size, void *udata, int flags)
1460 {
1461         struct uma_zctor_args *arg = udata;
1462         uma_zone_t zone = mem;
1463         uma_zone_t z;
1464         uma_keg_t keg;
1465
1466         bzero(zone, size);
1467         zone->uz_name = arg->name;
1468         zone->uz_ctor = arg->ctor;
1469         zone->uz_dtor = arg->dtor;
1470         zone->uz_slab = zone_fetch_slab;
1471         zone->uz_init = NULL;
1472         zone->uz_fini = NULL;
1473         zone->uz_allocs = 0;
1474         zone->uz_frees = 0;
1475         zone->uz_fails = 0;
1476         zone->uz_sleeps = 0;
1477         zone->uz_fills = zone->uz_count = 0;
1478         zone->uz_flags = 0;
1479         zone->uz_warning = NULL;
1480         timevalclear(&zone->uz_ratecheck);
1481         keg = arg->keg;
1482
1483         if (arg->flags & UMA_ZONE_SECONDARY) {
1484                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1485                 zone->uz_init = arg->uminit;
1486                 zone->uz_fini = arg->fini;
1487                 zone->uz_lock = &keg->uk_lock;
1488                 zone->uz_flags |= UMA_ZONE_SECONDARY;
1489                 mtx_lock(&uma_mtx);
1490                 ZONE_LOCK(zone);
1491                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1492                         if (LIST_NEXT(z, uz_link) == NULL) {
1493                                 LIST_INSERT_AFTER(z, zone, uz_link);
1494                                 break;
1495                         }
1496                 }
1497                 ZONE_UNLOCK(zone);
1498                 mtx_unlock(&uma_mtx);
1499         } else if (keg == NULL) {
1500                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1501                     arg->align, arg->flags)) == NULL)
1502                         return (ENOMEM);
1503         } else {
1504                 struct uma_kctor_args karg;
1505                 int error;
1506
1507                 /* We should only be here from uma_startup() */
1508                 karg.size = arg->size;
1509                 karg.uminit = arg->uminit;
1510                 karg.fini = arg->fini;
1511                 karg.align = arg->align;
1512                 karg.flags = arg->flags;
1513                 karg.zone = zone;
1514                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1515                     flags);
1516                 if (error)
1517                         return (error);
1518         }
1519         /*
1520          * Link in the first keg.
1521          */
1522         zone->uz_klink.kl_keg = keg;
1523         LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1524         zone->uz_lock = &keg->uk_lock;
1525         zone->uz_size = keg->uk_size;
1526         zone->uz_flags |= (keg->uk_flags &
1527             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1528
1529         /*
1530          * Some internal zones don't have room allocated for the per cpu
1531          * caches.  If we're internal, bail out here.
1532          */
1533         if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1534                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1535                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1536                 return (0);
1537         }
1538
1539         if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1540                 zone->uz_count = BUCKET_MAX;
1541         else if (keg->uk_ipers <= BUCKET_MAX)
1542                 zone->uz_count = keg->uk_ipers;
1543         else
1544                 zone->uz_count = BUCKET_MAX;
1545         return (0);
1546 }
1547
1548 /*
1549  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1550  * table and removes the keg from the global list.
1551  *
1552  * Arguments/Returns follow uma_dtor specifications
1553  *      udata  unused
1554  */
1555 static void
1556 keg_dtor(void *arg, int size, void *udata)
1557 {
1558         uma_keg_t keg;
1559
1560         keg = (uma_keg_t)arg;
1561         KEG_LOCK(keg);
1562         if (keg->uk_free != 0) {
1563                 printf("Freed UMA keg was not empty (%d items). "
1564                     " Lost %d pages of memory.\n",
1565                     keg->uk_free, keg->uk_pages);
1566         }
1567         KEG_UNLOCK(keg);
1568
1569         hash_free(&keg->uk_hash);
1570
1571         KEG_LOCK_FINI(keg);
1572 }
1573
1574 /*
1575  * Zone header dtor.
1576  *
1577  * Arguments/Returns follow uma_dtor specifications
1578  *      udata  unused
1579  */
1580 static void
1581 zone_dtor(void *arg, int size, void *udata)
1582 {
1583         uma_klink_t klink;
1584         uma_zone_t zone;
1585         uma_keg_t keg;
1586
1587         zone = (uma_zone_t)arg;
1588         keg = zone_first_keg(zone);
1589
1590         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1591                 cache_drain(zone);
1592
1593         mtx_lock(&uma_mtx);
1594         LIST_REMOVE(zone, uz_link);
1595         mtx_unlock(&uma_mtx);
1596         /*
1597          * XXX there are some races here where
1598          * the zone can be drained but zone lock
1599          * released and then refilled before we
1600          * remove it... we dont care for now
1601          */
1602         zone_drain_wait(zone, M_WAITOK);
1603         /*
1604          * Unlink all of our kegs.
1605          */
1606         while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1607                 klink->kl_keg = NULL;
1608                 LIST_REMOVE(klink, kl_link);
1609                 if (klink == &zone->uz_klink)
1610                         continue;
1611                 free(klink, M_TEMP);
1612         }
1613         /*
1614          * We only destroy kegs from non secondary zones.
1615          */
1616         if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1617                 mtx_lock(&uma_mtx);
1618                 LIST_REMOVE(keg, uk_link);
1619                 mtx_unlock(&uma_mtx);
1620                 zone_free_item(kegs, keg, NULL, SKIP_NONE,
1621                     ZFREE_STATFREE);
1622         }
1623 }
1624
1625 /*
1626  * Traverses every zone in the system and calls a callback
1627  *
1628  * Arguments:
1629  *      zfunc  A pointer to a function which accepts a zone
1630  *              as an argument.
1631  *
1632  * Returns:
1633  *      Nothing
1634  */
1635 static void
1636 zone_foreach(void (*zfunc)(uma_zone_t))
1637 {
1638         uma_keg_t keg;
1639         uma_zone_t zone;
1640
1641         mtx_lock(&uma_mtx);
1642         LIST_FOREACH(keg, &uma_kegs, uk_link) {
1643                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1644                         zfunc(zone);
1645         }
1646         mtx_unlock(&uma_mtx);
1647 }
1648
1649 /* Public functions */
1650 /* See uma.h */
1651 void
1652 uma_startup(void *bootmem, int boot_pages)
1653 {
1654         struct uma_zctor_args args;
1655         uma_slab_t slab;
1656         u_int slabsize;
1657         u_int objsize, totsize, wsize;
1658         int i;
1659
1660 #ifdef UMA_DEBUG
1661         printf("Creating uma keg headers zone and keg.\n");
1662 #endif
1663         mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1664
1665         /*
1666          * Figure out the maximum number of items-per-slab we'll have if
1667          * we're using the OFFPAGE slab header to track free items, given
1668          * all possible object sizes and the maximum desired wastage
1669          * (UMA_MAX_WASTE).
1670          *
1671          * We iterate until we find an object size for
1672          * which the calculated wastage in keg_small_init() will be
1673          * enough to warrant OFFPAGE.  Since wastedspace versus objsize
1674          * is an overall increasing see-saw function, we find the smallest
1675          * objsize such that the wastage is always acceptable for objects
1676          * with that objsize or smaller.  Since a smaller objsize always
1677          * generates a larger possible uma_max_ipers, we use this computed
1678          * objsize to calculate the largest ipers possible.  Since the
1679          * ipers calculated for OFFPAGE slab headers is always larger than
1680          * the ipers initially calculated in keg_small_init(), we use
1681          * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
1682          * obtain the maximum ipers possible for offpage slab headers.
1683          *
1684          * It should be noted that ipers versus objsize is an inversly
1685          * proportional function which drops off rather quickly so as
1686          * long as our UMA_MAX_WASTE is such that the objsize we calculate
1687          * falls into the portion of the inverse relation AFTER the steep
1688          * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
1689          *
1690          * Note that we have 8-bits (1 byte) to use as a freelist index
1691          * inside the actual slab header itself and this is enough to
1692          * accomodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
1693          * object with offpage slab header would have ipers =
1694          * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
1695          * 1 greater than what our byte-integer freelist index can
1696          * accomodate, but we know that this situation never occurs as
1697          * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
1698          * that we need to go to offpage slab headers.  Or, if we do,
1699          * then we trap that condition below and panic in the INVARIANTS case.
1700          */
1701         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) -
1702             (UMA_SLAB_SIZE / UMA_MAX_WASTE);
1703         totsize = wsize;
1704         objsize = UMA_SMALLEST_UNIT;
1705         while (totsize >= wsize) {
1706                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
1707                     (objsize + UMA_FRITM_SZ);
1708                 totsize *= (UMA_FRITM_SZ + objsize);
1709                 objsize++;
1710         }
1711         if (objsize > UMA_SMALLEST_UNIT)
1712                 objsize--;
1713         uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
1714
1715         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
1716             (UMA_SLAB_SIZE / UMA_MAX_WASTE);
1717         totsize = wsize;
1718         objsize = UMA_SMALLEST_UNIT;
1719         while (totsize >= wsize) {
1720                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
1721                     (objsize + UMA_FRITMREF_SZ);
1722                 totsize *= (UMA_FRITMREF_SZ + objsize);
1723                 objsize++;
1724         }
1725         if (objsize > UMA_SMALLEST_UNIT)
1726                 objsize--;
1727         uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
1728
1729         KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
1730             ("uma_startup: calculated uma_max_ipers values too large!"));
1731
1732 #ifdef UMA_DEBUG
1733         printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
1734         printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
1735             uma_max_ipers_ref);
1736 #endif
1737
1738         /* "manually" create the initial zone */
1739         args.name = "UMA Kegs";
1740         args.size = sizeof(struct uma_keg);
1741         args.ctor = keg_ctor;
1742         args.dtor = keg_dtor;
1743         args.uminit = zero_init;
1744         args.fini = NULL;
1745         args.keg = &masterkeg;
1746         args.align = 32 - 1;
1747         args.flags = UMA_ZFLAG_INTERNAL;
1748         /* The initial zone has no Per cpu queues so it's smaller */
1749         zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1750
1751 #ifdef UMA_DEBUG
1752         printf("Filling boot free list.\n");
1753 #endif
1754         for (i = 0; i < boot_pages; i++) {
1755                 slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1756                 slab->us_data = (u_int8_t *)slab;
1757                 slab->us_flags = UMA_SLAB_BOOT;
1758                 LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1759         }
1760         mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1761
1762 #ifdef UMA_DEBUG
1763         printf("Creating uma zone headers zone and keg.\n");
1764 #endif
1765         args.name = "UMA Zones";
1766         args.size = sizeof(struct uma_zone) +
1767             (sizeof(struct uma_cache) * (mp_maxid + 1));
1768         args.ctor = zone_ctor;
1769         args.dtor = zone_dtor;
1770         args.uminit = zero_init;
1771         args.fini = NULL;
1772         args.keg = NULL;
1773         args.align = 32 - 1;
1774         args.flags = UMA_ZFLAG_INTERNAL;
1775         /* The initial zone has no Per cpu queues so it's smaller */
1776         zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1777
1778 #ifdef UMA_DEBUG
1779         printf("Initializing pcpu cache locks.\n");
1780 #endif
1781 #ifdef UMA_DEBUG
1782         printf("Creating slab and hash zones.\n");
1783 #endif
1784
1785         /*
1786          * This is the max number of free list items we'll have with
1787          * offpage slabs.
1788          */
1789         slabsize = uma_max_ipers * UMA_FRITM_SZ;
1790         slabsize += sizeof(struct uma_slab);
1791
1792         /* Now make a zone for slab headers */
1793         slabzone = uma_zcreate("UMA Slabs",
1794                                 slabsize,
1795                                 NULL, NULL, NULL, NULL,
1796                                 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1797
1798         /*
1799          * We also create a zone for the bigger slabs with reference
1800          * counts in them, to accomodate UMA_ZONE_REFCNT zones.
1801          */
1802         slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
1803         slabsize += sizeof(struct uma_slab_refcnt);
1804         slabrefzone = uma_zcreate("UMA RCntSlabs",
1805                                   slabsize,
1806                                   NULL, NULL, NULL, NULL,
1807                                   UMA_ALIGN_PTR,
1808                                   UMA_ZFLAG_INTERNAL);
1809
1810         hashzone = uma_zcreate("UMA Hash",
1811             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1812             NULL, NULL, NULL, NULL,
1813             UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1814
1815         bucket_init();
1816
1817         booted = UMA_STARTUP;
1818
1819 #ifdef UMA_DEBUG
1820         printf("UMA startup complete.\n");
1821 #endif
1822 }
1823
1824 /* see uma.h */
1825 void
1826 uma_startup2(void)
1827 {
1828         booted = UMA_STARTUP2;
1829         bucket_enable();
1830 #ifdef UMA_DEBUG
1831         printf("UMA startup2 complete.\n");
1832 #endif
1833 }
1834
1835 /*
1836  * Initialize our callout handle
1837  *
1838  */
1839
1840 static void
1841 uma_startup3(void)
1842 {
1843 #ifdef UMA_DEBUG
1844         printf("Starting callout.\n");
1845 #endif
1846         callout_init(&uma_callout, CALLOUT_MPSAFE);
1847         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1848 #ifdef UMA_DEBUG
1849         printf("UMA startup3 complete.\n");
1850 #endif
1851 }
1852
1853 static uma_keg_t
1854 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1855                 int align, u_int32_t flags)
1856 {
1857         struct uma_kctor_args args;
1858
1859         args.size = size;
1860         args.uminit = uminit;
1861         args.fini = fini;
1862         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1863         args.flags = flags;
1864         args.zone = zone;
1865         return (zone_alloc_item(kegs, &args, M_WAITOK));
1866 }
1867
1868 /* See uma.h */
1869 void
1870 uma_set_align(int align)
1871 {
1872
1873         if (align != UMA_ALIGN_CACHE)
1874                 uma_align_cache = align;
1875 }
1876
1877 /* See uma.h */
1878 uma_zone_t
1879 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1880                 uma_init uminit, uma_fini fini, int align, u_int32_t flags)
1881
1882 {
1883         struct uma_zctor_args args;
1884
1885         /* This stuff is essential for the zone ctor */
1886         args.name = name;
1887         args.size = size;
1888         args.ctor = ctor;
1889         args.dtor = dtor;
1890         args.uminit = uminit;
1891         args.fini = fini;
1892         args.align = align;
1893         args.flags = flags;
1894         args.keg = NULL;
1895
1896         return (zone_alloc_item(zones, &args, M_WAITOK));
1897 }
1898
1899 /* See uma.h */
1900 uma_zone_t
1901 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1902                     uma_init zinit, uma_fini zfini, uma_zone_t master)
1903 {
1904         struct uma_zctor_args args;
1905         uma_keg_t keg;
1906
1907         keg = zone_first_keg(master);
1908         args.name = name;
1909         args.size = keg->uk_size;
1910         args.ctor = ctor;
1911         args.dtor = dtor;
1912         args.uminit = zinit;
1913         args.fini = zfini;
1914         args.align = keg->uk_align;
1915         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1916         args.keg = keg;
1917
1918         /* XXX Attaches only one keg of potentially many. */
1919         return (zone_alloc_item(zones, &args, M_WAITOK));
1920 }
1921
1922 static void
1923 zone_lock_pair(uma_zone_t a, uma_zone_t b)
1924 {
1925         if (a < b) {
1926                 ZONE_LOCK(a);
1927                 mtx_lock_flags(b->uz_lock, MTX_DUPOK);
1928         } else {
1929                 ZONE_LOCK(b);
1930                 mtx_lock_flags(a->uz_lock, MTX_DUPOK);
1931         }
1932 }
1933
1934 static void
1935 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
1936 {
1937
1938         ZONE_UNLOCK(a);
1939         ZONE_UNLOCK(b);
1940 }
1941
1942 int
1943 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
1944 {
1945         uma_klink_t klink;
1946         uma_klink_t kl;
1947         int error;
1948
1949         error = 0;
1950         klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
1951
1952         zone_lock_pair(zone, master);
1953         /*
1954          * zone must use vtoslab() to resolve objects and must already be
1955          * a secondary.
1956          */
1957         if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
1958             != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
1959                 error = EINVAL;
1960                 goto out;
1961         }
1962         /*
1963          * The new master must also use vtoslab().
1964          */
1965         if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
1966                 error = EINVAL;
1967                 goto out;
1968         }
1969         /*
1970          * Both must either be refcnt, or not be refcnt.
1971          */
1972         if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
1973             (master->uz_flags & UMA_ZONE_REFCNT)) {
1974                 error = EINVAL;
1975                 goto out;
1976         }
1977         /*
1978          * The underlying object must be the same size.  rsize
1979          * may be different.
1980          */
1981         if (master->uz_size != zone->uz_size) {
1982                 error = E2BIG;
1983                 goto out;
1984         }
1985         /*
1986          * Put it at the end of the list.
1987          */
1988         klink->kl_keg = zone_first_keg(master);
1989         LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
1990                 if (LIST_NEXT(kl, kl_link) == NULL) {
1991                         LIST_INSERT_AFTER(kl, klink, kl_link);
1992                         break;
1993                 }
1994         }
1995         klink = NULL;
1996         zone->uz_flags |= UMA_ZFLAG_MULTI;
1997         zone->uz_slab = zone_fetch_slab_multi;
1998
1999 out:
2000         zone_unlock_pair(zone, master);
2001         if (klink != NULL)
2002                 free(klink, M_TEMP);
2003
2004         return (error);
2005 }
2006
2007
2008 /* See uma.h */
2009 void
2010 uma_zdestroy(uma_zone_t zone)
2011 {
2012
2013         zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
2014 }
2015
2016 /* See uma.h */
2017 void *
2018 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2019 {
2020         void *item;
2021         uma_cache_t cache;
2022         uma_bucket_t bucket;
2023         int cpu;
2024
2025         /* This is the fast path allocation */
2026 #ifdef UMA_DEBUG_ALLOC_1
2027         printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2028 #endif
2029         CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2030             zone->uz_name, flags);
2031
2032         if (flags & M_WAITOK) {
2033                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2034                     "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2035         }
2036 #ifdef DEBUG_MEMGUARD
2037         if (memguard_cmp_zone(zone)) {
2038                 item = memguard_alloc(zone->uz_size, flags);
2039                 if (item != NULL) {
2040                         /*
2041                          * Avoid conflict with the use-after-free
2042                          * protecting infrastructure from INVARIANTS.
2043                          */
2044                         if (zone->uz_init != NULL &&
2045                             zone->uz_init != mtrash_init &&
2046                             zone->uz_init(item, zone->uz_size, flags) != 0)
2047                                 return (NULL);
2048                         if (zone->uz_ctor != NULL &&
2049                             zone->uz_ctor != mtrash_ctor &&
2050                             zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2051                                 zone->uz_fini(item, zone->uz_size);
2052                                 return (NULL);
2053                         }
2054                         return (item);
2055                 }
2056                 /* This is unfortunate but should not be fatal. */
2057         }
2058 #endif
2059         /*
2060          * If possible, allocate from the per-CPU cache.  There are two
2061          * requirements for safe access to the per-CPU cache: (1) the thread
2062          * accessing the cache must not be preempted or yield during access,
2063          * and (2) the thread must not migrate CPUs without switching which
2064          * cache it accesses.  We rely on a critical section to prevent
2065          * preemption and migration.  We release the critical section in
2066          * order to acquire the zone mutex if we are unable to allocate from
2067          * the current cache; when we re-acquire the critical section, we
2068          * must detect and handle migration if it has occurred.
2069          */
2070 zalloc_restart:
2071         critical_enter();
2072         cpu = curcpu;
2073         cache = &zone->uz_cpu[cpu];
2074
2075 zalloc_start:
2076         bucket = cache->uc_allocbucket;
2077
2078         if (bucket) {
2079                 if (bucket->ub_cnt > 0) {
2080                         bucket->ub_cnt--;
2081                         item = bucket->ub_bucket[bucket->ub_cnt];
2082 #ifdef INVARIANTS
2083                         bucket->ub_bucket[bucket->ub_cnt] = NULL;
2084 #endif
2085                         KASSERT(item != NULL,
2086                             ("uma_zalloc: Bucket pointer mangled."));
2087                         cache->uc_allocs++;
2088                         critical_exit();
2089 #ifdef INVARIANTS
2090                         ZONE_LOCK(zone);
2091                         uma_dbg_alloc(zone, NULL, item);
2092                         ZONE_UNLOCK(zone);
2093 #endif
2094                         if (zone->uz_ctor != NULL) {
2095                                 if (zone->uz_ctor(item, zone->uz_size,
2096                                     udata, flags) != 0) {
2097                                         zone_free_item(zone, item, udata,
2098                                             SKIP_DTOR, ZFREE_STATFAIL |
2099                                             ZFREE_STATFREE);
2100                                         return (NULL);
2101                                 }
2102                         }
2103                         if (flags & M_ZERO)
2104                                 bzero(item, zone->uz_size);
2105                         return (item);
2106                 } else if (cache->uc_freebucket) {
2107                         /*
2108                          * We have run out of items in our allocbucket.
2109                          * See if we can switch with our free bucket.
2110                          */
2111                         if (cache->uc_freebucket->ub_cnt > 0) {
2112 #ifdef UMA_DEBUG_ALLOC
2113                                 printf("uma_zalloc: Swapping empty with"
2114                                     " alloc.\n");
2115 #endif
2116                                 bucket = cache->uc_freebucket;
2117                                 cache->uc_freebucket = cache->uc_allocbucket;
2118                                 cache->uc_allocbucket = bucket;
2119
2120                                 goto zalloc_start;
2121                         }
2122                 }
2123         }
2124         /*
2125          * Attempt to retrieve the item from the per-CPU cache has failed, so
2126          * we must go back to the zone.  This requires the zone lock, so we
2127          * must drop the critical section, then re-acquire it when we go back
2128          * to the cache.  Since the critical section is released, we may be
2129          * preempted or migrate.  As such, make sure not to maintain any
2130          * thread-local state specific to the cache from prior to releasing
2131          * the critical section.
2132          */
2133         critical_exit();
2134         ZONE_LOCK(zone);
2135         critical_enter();
2136         cpu = curcpu;
2137         cache = &zone->uz_cpu[cpu];
2138         bucket = cache->uc_allocbucket;
2139         if (bucket != NULL) {
2140                 if (bucket->ub_cnt > 0) {
2141                         ZONE_UNLOCK(zone);
2142                         goto zalloc_start;
2143                 }
2144                 bucket = cache->uc_freebucket;
2145                 if (bucket != NULL && bucket->ub_cnt > 0) {
2146                         ZONE_UNLOCK(zone);
2147                         goto zalloc_start;
2148                 }
2149         }
2150
2151         /* Since we have locked the zone we may as well send back our stats */
2152         zone->uz_allocs += cache->uc_allocs;
2153         cache->uc_allocs = 0;
2154         zone->uz_frees += cache->uc_frees;
2155         cache->uc_frees = 0;
2156
2157         /* Our old one is now a free bucket */
2158         if (cache->uc_allocbucket) {
2159                 KASSERT(cache->uc_allocbucket->ub_cnt == 0,
2160                     ("uma_zalloc_arg: Freeing a non free bucket."));
2161                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
2162                     cache->uc_allocbucket, ub_link);
2163                 cache->uc_allocbucket = NULL;
2164         }
2165
2166         /* Check the free list for a new alloc bucket */
2167         if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
2168                 KASSERT(bucket->ub_cnt != 0,
2169                     ("uma_zalloc_arg: Returning an empty bucket."));
2170
2171                 LIST_REMOVE(bucket, ub_link);
2172                 cache->uc_allocbucket = bucket;
2173                 ZONE_UNLOCK(zone);
2174                 goto zalloc_start;
2175         }
2176         /* We are no longer associated with this CPU. */
2177         critical_exit();
2178
2179         /* Bump up our uz_count so we get here less */
2180         if (zone->uz_count < BUCKET_MAX)
2181                 zone->uz_count++;
2182
2183         /*
2184          * Now lets just fill a bucket and put it on the free list.  If that
2185          * works we'll restart the allocation from the begining.
2186          */
2187         if (zone_alloc_bucket(zone, flags)) {
2188                 ZONE_UNLOCK(zone);
2189                 goto zalloc_restart;
2190         }
2191         ZONE_UNLOCK(zone);
2192         /*
2193          * We may not be able to get a bucket so return an actual item.
2194          */
2195 #ifdef UMA_DEBUG
2196         printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2197 #endif
2198
2199         item = zone_alloc_item(zone, udata, flags);
2200         return (item);
2201 }
2202
2203 static uma_slab_t
2204 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2205 {
2206         uma_slab_t slab;
2207
2208         mtx_assert(&keg->uk_lock, MA_OWNED);
2209         slab = NULL;
2210
2211         for (;;) {
2212                 /*
2213                  * Find a slab with some space.  Prefer slabs that are partially
2214                  * used over those that are totally full.  This helps to reduce
2215                  * fragmentation.
2216                  */
2217                 if (keg->uk_free != 0) {
2218                         if (!LIST_EMPTY(&keg->uk_part_slab)) {
2219                                 slab = LIST_FIRST(&keg->uk_part_slab);
2220                         } else {
2221                                 slab = LIST_FIRST(&keg->uk_free_slab);
2222                                 LIST_REMOVE(slab, us_link);
2223                                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2224                                     us_link);
2225                         }
2226                         MPASS(slab->us_keg == keg);
2227                         return (slab);
2228                 }
2229
2230                 /*
2231                  * M_NOVM means don't ask at all!
2232                  */
2233                 if (flags & M_NOVM)
2234                         break;
2235
2236                 if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2237                         keg->uk_flags |= UMA_ZFLAG_FULL;
2238                         /*
2239                          * If this is not a multi-zone, set the FULL bit.
2240                          * Otherwise slab_multi() takes care of it.
2241                          */
2242                         if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2243                                 zone->uz_flags |= UMA_ZFLAG_FULL;
2244                                 zone_log_warning(zone);
2245                         }
2246                         if (flags & M_NOWAIT)
2247                                 break;
2248                         zone->uz_sleeps++;
2249                         msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2250                         continue;
2251                 }
2252                 keg->uk_recurse++;
2253                 slab = keg_alloc_slab(keg, zone, flags);
2254                 keg->uk_recurse--;
2255                 /*
2256                  * If we got a slab here it's safe to mark it partially used
2257                  * and return.  We assume that the caller is going to remove
2258                  * at least one item.
2259                  */
2260                 if (slab) {
2261                         MPASS(slab->us_keg == keg);
2262                         LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2263                         return (slab);
2264                 }
2265                 /*
2266                  * We might not have been able to get a slab but another cpu
2267                  * could have while we were unlocked.  Check again before we
2268                  * fail.
2269                  */
2270                 flags |= M_NOVM;
2271         }
2272         return (slab);
2273 }
2274
2275 static inline void
2276 zone_relock(uma_zone_t zone, uma_keg_t keg)
2277 {
2278         if (zone->uz_lock != &keg->uk_lock) {
2279                 KEG_UNLOCK(keg);
2280                 ZONE_LOCK(zone);
2281         }
2282 }
2283
2284 static inline void
2285 keg_relock(uma_keg_t keg, uma_zone_t zone)
2286 {
2287         if (zone->uz_lock != &keg->uk_lock) {
2288                 ZONE_UNLOCK(zone);
2289                 KEG_LOCK(keg);
2290         }
2291 }
2292
2293 static uma_slab_t
2294 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2295 {
2296         uma_slab_t slab;
2297
2298         if (keg == NULL)
2299                 keg = zone_first_keg(zone);
2300         /*
2301          * This is to prevent us from recursively trying to allocate
2302          * buckets.  The problem is that if an allocation forces us to
2303          * grab a new bucket we will call page_alloc, which will go off
2304          * and cause the vm to allocate vm_map_entries.  If we need new
2305          * buckets there too we will recurse in kmem_alloc and bad
2306          * things happen.  So instead we return a NULL bucket, and make
2307          * the code that allocates buckets smart enough to deal with it
2308          */
2309         if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
2310                 return (NULL);
2311
2312         for (;;) {
2313                 slab = keg_fetch_slab(keg, zone, flags);
2314                 if (slab)
2315                         return (slab);
2316                 if (flags & (M_NOWAIT | M_NOVM))
2317                         break;
2318         }
2319         return (NULL);
2320 }
2321
2322 /*
2323  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2324  * with the keg locked.  Caller must call zone_relock() afterwards if the
2325  * zone lock is required.  On NULL the zone lock is held.
2326  *
2327  * The last pointer is used to seed the search.  It is not required.
2328  */
2329 static uma_slab_t
2330 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2331 {
2332         uma_klink_t klink;
2333         uma_slab_t slab;
2334         uma_keg_t keg;
2335         int flags;
2336         int empty;
2337         int full;
2338
2339         /*
2340          * Don't wait on the first pass.  This will skip limit tests
2341          * as well.  We don't want to block if we can find a provider
2342          * without blocking.
2343          */
2344         flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2345         /*
2346          * Use the last slab allocated as a hint for where to start
2347          * the search.
2348          */
2349         if (last) {
2350                 slab = keg_fetch_slab(last, zone, flags);
2351                 if (slab)
2352                         return (slab);
2353                 zone_relock(zone, last);
2354                 last = NULL;
2355         }
2356         /*
2357          * Loop until we have a slab incase of transient failures
2358          * while M_WAITOK is specified.  I'm not sure this is 100%
2359          * required but we've done it for so long now.
2360          */
2361         for (;;) {
2362                 empty = 0;
2363                 full = 0;
2364                 /*
2365                  * Search the available kegs for slabs.  Be careful to hold the
2366                  * correct lock while calling into the keg layer.
2367                  */
2368                 LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2369                         keg = klink->kl_keg;
2370                         keg_relock(keg, zone);
2371                         if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2372                                 slab = keg_fetch_slab(keg, zone, flags);
2373                                 if (slab)
2374                                         return (slab);
2375                         }
2376                         if (keg->uk_flags & UMA_ZFLAG_FULL)
2377                                 full++;
2378                         else
2379                                 empty++;
2380                         zone_relock(zone, keg);
2381                 }
2382                 if (rflags & (M_NOWAIT | M_NOVM))
2383                         break;
2384                 flags = rflags;
2385                 /*
2386                  * All kegs are full.  XXX We can't atomically check all kegs
2387                  * and sleep so just sleep for a short period and retry.
2388                  */
2389                 if (full && !empty) {
2390                         zone->uz_flags |= UMA_ZFLAG_FULL;
2391                         zone->uz_sleeps++;
2392                         zone_log_warning(zone);
2393                         msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
2394                         zone->uz_flags &= ~UMA_ZFLAG_FULL;
2395                         continue;
2396                 }
2397         }
2398         return (NULL);
2399 }
2400
2401 static void *
2402 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
2403 {
2404         uma_keg_t keg;
2405         uma_slabrefcnt_t slabref;
2406         void *item;
2407         u_int8_t freei;
2408
2409         keg = slab->us_keg;
2410         mtx_assert(&keg->uk_lock, MA_OWNED);
2411
2412         freei = slab->us_firstfree;
2413         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2414                 slabref = (uma_slabrefcnt_t)slab;
2415                 slab->us_firstfree = slabref->us_freelist[freei].us_item;
2416         } else {
2417                 slab->us_firstfree = slab->us_freelist[freei].us_item;
2418         }
2419         item = slab->us_data + (keg->uk_rsize * freei);
2420
2421         slab->us_freecount--;
2422         keg->uk_free--;
2423 #ifdef INVARIANTS
2424         uma_dbg_alloc(zone, slab, item);
2425 #endif
2426         /* Move this slab to the full list */
2427         if (slab->us_freecount == 0) {
2428                 LIST_REMOVE(slab, us_link);
2429                 LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2430         }
2431
2432         return (item);
2433 }
2434
2435 static int
2436 zone_alloc_bucket(uma_zone_t zone, int flags)
2437 {
2438         uma_bucket_t bucket;
2439         uma_slab_t slab;
2440         uma_keg_t keg;
2441         int16_t saved;
2442         int max, origflags = flags;
2443
2444         /*
2445          * Try this zone's free list first so we don't allocate extra buckets.
2446          */
2447         if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2448                 KASSERT(bucket->ub_cnt == 0,
2449                     ("zone_alloc_bucket: Bucket on free list is not empty."));
2450                 LIST_REMOVE(bucket, ub_link);
2451         } else {
2452                 int bflags;
2453
2454                 bflags = (flags & ~M_ZERO);
2455                 if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2456                         bflags |= M_NOVM;
2457
2458                 ZONE_UNLOCK(zone);
2459                 bucket = bucket_alloc(zone->uz_count, bflags);
2460                 ZONE_LOCK(zone);
2461         }
2462
2463         if (bucket == NULL) {
2464                 return (0);
2465         }
2466
2467 #ifdef SMP
2468         /*
2469          * This code is here to limit the number of simultaneous bucket fills
2470          * for any given zone to the number of per cpu caches in this zone. This
2471          * is done so that we don't allocate more memory than we really need.
2472          */
2473         if (zone->uz_fills >= mp_ncpus)
2474                 goto done;
2475
2476 #endif
2477         zone->uz_fills++;
2478
2479         max = MIN(bucket->ub_entries, zone->uz_count);
2480         /* Try to keep the buckets totally full */
2481         saved = bucket->ub_cnt;
2482         slab = NULL;
2483         keg = NULL;
2484         while (bucket->ub_cnt < max &&
2485             (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
2486                 keg = slab->us_keg;
2487                 while (slab->us_freecount && bucket->ub_cnt < max) {
2488                         bucket->ub_bucket[bucket->ub_cnt++] =
2489                             slab_alloc_item(zone, slab);
2490                 }
2491
2492                 /* Don't block on the next fill */
2493                 flags |= M_NOWAIT;
2494         }
2495         if (slab)
2496                 zone_relock(zone, keg);
2497
2498         /*
2499          * We unlock here because we need to call the zone's init.
2500          * It should be safe to unlock because the slab dealt with
2501          * above is already on the appropriate list within the keg
2502          * and the bucket we filled is not yet on any list, so we
2503          * own it.
2504          */
2505         if (zone->uz_init != NULL) {
2506                 int i;
2507
2508                 ZONE_UNLOCK(zone);
2509                 for (i = saved; i < bucket->ub_cnt; i++)
2510                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2511                             origflags) != 0)
2512                                 break;
2513                 /*
2514                  * If we couldn't initialize the whole bucket, put the
2515                  * rest back onto the freelist.
2516                  */
2517                 if (i != bucket->ub_cnt) {
2518                         int j;
2519
2520                         for (j = i; j < bucket->ub_cnt; j++) {
2521                                 zone_free_item(zone, bucket->ub_bucket[j],
2522                                     NULL, SKIP_FINI, 0);
2523 #ifdef INVARIANTS
2524                                 bucket->ub_bucket[j] = NULL;
2525 #endif
2526                         }
2527                         bucket->ub_cnt = i;
2528                 }
2529                 ZONE_LOCK(zone);
2530         }
2531
2532         zone->uz_fills--;
2533         if (bucket->ub_cnt != 0) {
2534                 LIST_INSERT_HEAD(&zone->uz_full_bucket,
2535                     bucket, ub_link);
2536                 return (1);
2537         }
2538 #ifdef SMP
2539 done:
2540 #endif
2541         bucket_free(bucket);
2542
2543         return (0);
2544 }
2545 /*
2546  * Allocates an item for an internal zone
2547  *
2548  * Arguments
2549  *      zone   The zone to alloc for.
2550  *      udata  The data to be passed to the constructor.
2551  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
2552  *
2553  * Returns
2554  *      NULL if there is no memory and M_NOWAIT is set
2555  *      An item if successful
2556  */
2557
2558 static void *
2559 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2560 {
2561         uma_slab_t slab;
2562         void *item;
2563
2564         item = NULL;
2565
2566 #ifdef UMA_DEBUG_ALLOC
2567         printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2568 #endif
2569         ZONE_LOCK(zone);
2570
2571         slab = zone->uz_slab(zone, NULL, flags);
2572         if (slab == NULL) {
2573                 zone->uz_fails++;
2574                 ZONE_UNLOCK(zone);
2575                 return (NULL);
2576         }
2577
2578         item = slab_alloc_item(zone, slab);
2579
2580         zone_relock(zone, slab->us_keg);
2581         zone->uz_allocs++;
2582         ZONE_UNLOCK(zone);
2583
2584         /*
2585          * We have to call both the zone's init (not the keg's init)
2586          * and the zone's ctor.  This is because the item is going from
2587          * a keg slab directly to the user, and the user is expecting it
2588          * to be both zone-init'd as well as zone-ctor'd.
2589          */
2590         if (zone->uz_init != NULL) {
2591                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2592                         zone_free_item(zone, item, udata, SKIP_FINI,
2593                             ZFREE_STATFAIL | ZFREE_STATFREE);
2594                         return (NULL);
2595                 }
2596         }
2597         if (zone->uz_ctor != NULL) {
2598                 if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2599                         zone_free_item(zone, item, udata, SKIP_DTOR,
2600                             ZFREE_STATFAIL | ZFREE_STATFREE);
2601                         return (NULL);
2602                 }
2603         }
2604         if (flags & M_ZERO)
2605                 bzero(item, zone->uz_size);
2606
2607         return (item);
2608 }
2609
/*
 * See uma.h.
 *
 * Free an item to a zone.  The zone dtor is run first; the item is then
 * stored in the current CPU's free bucket when one with room is available.
 * If the zone is flagged UMA_ZFLAG_FULL, or no bucket can be obtained, the
 * item is handed to zone_free_item() instead.
 *
 * Arguments:
 *      zone   The zone the item is returned to.
 *      item   The item to free; NULL is accepted and ignored.
 *      udata  Passed to the dtor (and to uma_dbg_free() for
 *             UMA_ZONE_MALLOC zones under INVARIANTS).
 */
void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
        uma_cache_t cache;
        uma_bucket_t bucket;
        int bflags;
        int cpu;

#ifdef UMA_DEBUG_ALLOC_1
        printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
        CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
            zone->uz_name);

        /* uma_zfree(..., NULL) does nothing, to match free(9). */
        if (item == NULL)
                return;
#ifdef DEBUG_MEMGUARD
        /*
         * Memguard-owned items never enter the caches: run the dtor and
         * fini here (skipping the mtrash debugging hooks) and give the
         * memory back to memguard.
         */
        if (is_memguard_addr(item)) {
                if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
                        zone->uz_dtor(item, zone->uz_size, udata);
                if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
                        zone->uz_fini(item, zone->uz_size);
                memguard_free(item);
                return;
        }
#endif
        /*
         * The dtor runs exactly once, here; the fallback at zfree_internal
         * therefore passes SKIP_DTOR.
         */
        if (zone->uz_dtor)
                zone->uz_dtor(item, zone->uz_size, udata);

#ifdef INVARIANTS
        ZONE_LOCK(zone);
        if (zone->uz_flags & UMA_ZONE_MALLOC)
                uma_dbg_free(zone, udata, item);
        else
                uma_dbg_free(zone, NULL, item);
        ZONE_UNLOCK(zone);
#endif
        /*
         * The race here is acceptable.  If we miss it we'll just have to wait
         * a little longer for the limits to be reset.
         */
        if (zone->uz_flags & UMA_ZFLAG_FULL)
                goto zfree_internal;

        /*
         * If possible, free to the per-CPU cache.  There are two
         * requirements for safe access to the per-CPU cache: (1) the thread
         * accessing the cache must not be preempted or yield during access,
         * and (2) the thread must not migrate CPUs without switching which
         * cache it accesses.  We rely on a critical section to prevent
         * preemption and migration.  We release the critical section in
         * order to acquire the zone mutex if we are unable to free to the
         * current cache; when we re-acquire the critical section, we must
         * detect and handle migration if it has occurred.
         */
zfree_restart:
        critical_enter();
        cpu = curcpu;
        cache = &zone->uz_cpu[cpu];

        /* Entered with the critical section held and 'cache' current. */
zfree_start:
        bucket = cache->uc_freebucket;

        if (bucket) {
                /*
                 * Do we have room in our bucket? It is OK for this uz count
                 * check to be slightly out of sync.
                 */

                if (bucket->ub_cnt < bucket->ub_entries) {
                        KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
                            ("uma_zfree: Freeing to non free bucket index."));
                        bucket->ub_bucket[bucket->ub_cnt] = item;
                        bucket->ub_cnt++;
                        cache->uc_frees++;
                        critical_exit();
                        return;
                } else if (cache->uc_allocbucket) {
#ifdef UMA_DEBUG_ALLOC
                        printf("uma_zfree: Swapping buckets.\n");
#endif
                        /*
                         * We have run out of space in our freebucket.
                         * See if we can switch with our alloc bucket.
                         */
                        if (cache->uc_allocbucket->ub_cnt <
                            cache->uc_freebucket->ub_cnt) {
                                bucket = cache->uc_freebucket;
                                cache->uc_freebucket = cache->uc_allocbucket;
                                cache->uc_allocbucket = bucket;
                                goto zfree_start;
                        }
                }
        }
        /*
         * We can get here for two reasons:
         *
         * 1) The buckets are NULL
         * 2) The alloc and free buckets are both somewhat full.
         *
         * We must go back to the zone, which requires acquiring the zone
         * lock, which in turn means we must release and re-acquire the
         * critical section.  Since the critical section is released, we may
         * be preempted or migrate.  As such, make sure not to maintain any
         * thread-local state specific to the cache from prior to releasing
         * the critical section.
         */
        critical_exit();
        ZONE_LOCK(zone);
        critical_enter();
        cpu = curcpu;
        cache = &zone->uz_cpu[cpu];
        /*
         * Re-check the (possibly different) cache: if it can now take the
         * item, or a bucket swap would help, drop the zone lock and retry
         * the fast path.
         */
        if (cache->uc_freebucket != NULL) {
                if (cache->uc_freebucket->ub_cnt <
                    cache->uc_freebucket->ub_entries) {
                        ZONE_UNLOCK(zone);
                        goto zfree_start;
                }
                if (cache->uc_allocbucket != NULL &&
                    (cache->uc_allocbucket->ub_cnt <
                    cache->uc_freebucket->ub_cnt)) {
                        ZONE_UNLOCK(zone);
                        goto zfree_start;
                }
        }

        /* Since we have locked the zone we may as well send back our stats */
        zone->uz_allocs += cache->uc_allocs;
        cache->uc_allocs = 0;
        zone->uz_frees += cache->uc_frees;
        cache->uc_frees = 0;

        /* Detach the free bucket from the cache. */
        bucket = cache->uc_freebucket;
        cache->uc_freebucket = NULL;

        /* Can we throw this on the zone full list? */
        if (bucket != NULL) {
#ifdef UMA_DEBUG_ALLOC
                printf("uma_zfree: Putting old bucket on the free list.\n");
#endif
                /* ub_cnt is pointing to the last free item */
                KASSERT(bucket->ub_cnt != 0,
                    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
                LIST_INSERT_HEAD(&zone->uz_full_bucket,
                    bucket, ub_link);
        }
        /*
         * Take a bucket from the zone's uz_free_bucket list if there is one
         * and retry; the critical section is still held, so 'cache' remains
         * valid after the zone lock is dropped.
         */
        if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
                LIST_REMOVE(bucket, ub_link);
                ZONE_UNLOCK(zone);
                cache->uc_freebucket = bucket;
                goto zfree_start;
        }
        /* We are no longer associated with this CPU. */
        critical_exit();

        /* And the zone.. */
        ZONE_UNLOCK(zone);

#ifdef UMA_DEBUG_ALLOC
        printf("uma_zfree: Allocating new free bucket.\n");
#endif
        /* The bucket allocation is attempted with M_NOWAIT. */
        bflags = M_NOWAIT;

        if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
                bflags |= M_NOVM;
        bucket = bucket_alloc(zone->uz_count, bflags);
        if (bucket) {
                /*
                 * Publish the new bucket on the zone's free-bucket list and
                 * start over; the restart will pick it up from there.
                 */
                ZONE_LOCK(zone);
                LIST_INSERT_HEAD(&zone->uz_free_bucket,
                    bucket, ub_link);
                ZONE_UNLOCK(zone);
                goto zfree_restart;
        }

        /*
         * If nothing else caught this, we'll just do an internal free.
         */
zfree_internal:
        zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);

        return;
}
2794
2795 /*
2796  * Frees an item to an INTERNAL zone or allocates a free bucket
2797  *
2798  * Arguments:
2799  *      zone   The zone to free to
2800  *      item   The item we're freeing
2801  *      udata  User supplied data for the dtor
2802  *      skip   Skip dtors and finis
2803  */
2804 static void
2805 zone_free_item(uma_zone_t zone, void *item, void *udata,
2806     enum zfreeskip skip, int flags)
2807 {
2808         uma_slab_t slab;
2809         uma_slabrefcnt_t slabref;
2810         uma_keg_t keg;
2811         u_int8_t *mem;
2812         u_int8_t freei;
2813         int clearfull;
2814
2815         if (skip < SKIP_DTOR && zone->uz_dtor)
2816                 zone->uz_dtor(item, zone->uz_size, udata);
2817
2818         if (skip < SKIP_FINI && zone->uz_fini)
2819                 zone->uz_fini(item, zone->uz_size);
2820
2821         ZONE_LOCK(zone);
2822
2823         if (flags & ZFREE_STATFAIL)
2824                 zone->uz_fails++;
2825         if (flags & ZFREE_STATFREE)
2826                 zone->uz_frees++;
2827
2828         if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2829                 mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2830                 keg = zone_first_keg(zone); /* Must only be one. */
2831                 if (zone->uz_flags & UMA_ZONE_HASH) {
2832                         slab = hash_sfind(&keg->uk_hash, mem);
2833                 } else {
2834                         mem += keg->uk_pgoff;
2835                         slab = (uma_slab_t)mem;
2836                 }
2837         } else {
2838                 /* This prevents redundant lookups via free(). */
2839                 if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
2840                         slab = (uma_slab_t)udata;
2841                 else
2842                         slab = vtoslab((vm_offset_t)item);
2843                 keg = slab->us_keg;
2844                 keg_relock(keg, zone);
2845         }
2846         MPASS(keg == slab->us_keg);
2847
2848         /* Do we need to remove from any lists? */
2849         if (slab->us_freecount+1 == keg->uk_ipers) {
2850                 LIST_REMOVE(slab, us_link);
2851                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2852         } else if (slab->us_freecount == 0) {
2853                 LIST_REMOVE(slab, us_link);
2854                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2855         }
2856
2857         /* Slab management stuff */
2858         freei = ((unsigned long)item - (unsigned long)slab->us_data)
2859                 / keg->uk_rsize;
2860
2861 #ifdef INVARIANTS
2862         if (!skip)
2863                 uma_dbg_free(zone, slab, item);
2864 #endif
2865
2866         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2867                 slabref = (uma_slabrefcnt_t)slab;
2868                 slabref->us_freelist[freei].us_item = slab->us_firstfree;
2869         } else {
2870                 slab->us_freelist[freei].us_item = slab->us_firstfree;
2871         }
2872         slab->us_firstfree = freei;
2873         slab->us_freecount++;
2874
2875         /* Zone statistics */
2876         keg->uk_free++;
2877
2878         clearfull = 0;
2879         if (keg->uk_flags & UMA_ZFLAG_FULL) {
2880                 if (keg->uk_pages < keg->uk_maxpages) {
2881                         keg->uk_flags &= ~UMA_ZFLAG_FULL;
2882                         clearfull = 1;
2883                 }
2884
2885                 /*
2886                  * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
2887                  * wake up all procs blocked on pages. This should be uncommon, so
2888                  * keeping this simple for now (rather than adding count of blocked
2889                  * threads etc).
2890                  */
2891                 wakeup(keg);
2892         }
2893         if (clearfull) {
2894                 zone_relock(zone, keg);
2895                 zone->uz_flags &= ~UMA_ZFLAG_FULL;
2896                 wakeup(zone);
2897                 ZONE_UNLOCK(zone);
2898         } else
2899                 KEG_UNLOCK(keg);
2900 }
2901
2902 /* See uma.h */
2903 int
2904 uma_zone_set_max(uma_zone_t zone, int nitems)
2905 {
2906         uma_keg_t keg;
2907
2908         ZONE_LOCK(zone);
2909         keg = zone_first_keg(zone);
2910         keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2911         if (keg->uk_maxpages * keg->uk_ipers < nitems)
2912                 keg->uk_maxpages += keg->uk_ppera;
2913         nitems = keg->uk_maxpages * keg->uk_ipers;
2914         ZONE_UNLOCK(zone);
2915
2916         return (nitems);
2917 }
2918
2919 /* See uma.h */
2920 int
2921 uma_zone_get_max(uma_zone_t zone)
2922 {
2923         int nitems;
2924         uma_keg_t keg;
2925
2926         ZONE_LOCK(zone);
2927         keg = zone_first_keg(zone);
2928         nitems = keg->uk_maxpages * keg->uk_ipers;
2929         ZONE_UNLOCK(zone);
2930
2931         return (nitems);
2932 }
2933
2934 /* See uma.h */
2935 void
2936 uma_zone_set_warning(uma_zone_t zone, const char *warning)
2937 {
2938
2939         ZONE_LOCK(zone);
2940         zone->uz_warning = warning;
2941         ZONE_UNLOCK(zone);
2942 }
2943
2944 /* See uma.h */
2945 int
2946 uma_zone_get_cur(uma_zone_t zone)
2947 {
2948         int64_t nitems;
2949         u_int i;
2950
2951         ZONE_LOCK(zone);
2952         nitems = zone->uz_allocs - zone->uz_frees;
2953         CPU_FOREACH(i) {
2954                 /*
2955                  * See the comment in sysctl_vm_zone_stats() regarding the
2956                  * safety of accessing the per-cpu caches. With the zone lock
2957                  * held, it is safe, but can potentially result in stale data.
2958                  */
2959                 nitems += zone->uz_cpu[i].uc_allocs -
2960                     zone->uz_cpu[i].uc_frees;
2961         }
2962         ZONE_UNLOCK(zone);
2963
2964         return (nitems < 0 ? 0 : nitems);
2965 }
2966
2967 /* See uma.h */
2968 void
2969 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2970 {
2971         uma_keg_t keg;
2972
2973         ZONE_LOCK(zone);
2974         keg = zone_first_keg(zone);
2975         KASSERT(keg->uk_pages == 0,
2976             ("uma_zone_set_init on non-empty keg"));
2977         keg->uk_init = uminit;
2978         ZONE_UNLOCK(zone);
2979 }
2980
2981 /* See uma.h */
2982 void
2983 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2984 {
2985         uma_keg_t keg;
2986
2987         ZONE_LOCK(zone);
2988         keg = zone_first_keg(zone);
2989         KASSERT(keg->uk_pages == 0,
2990             ("uma_zone_set_fini on non-empty keg"));
2991         keg->uk_fini = fini;
2992         ZONE_UNLOCK(zone);
2993 }
2994
2995 /* See uma.h */
2996 void
2997 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2998 {
2999         ZONE_LOCK(zone);
3000         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3001             ("uma_zone_set_zinit on non-empty keg"));
3002         zone->uz_init = zinit;
3003         ZONE_UNLOCK(zone);
3004 }
3005
3006 /* See uma.h */
3007 void
3008 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3009 {
3010         ZONE_LOCK(zone);
3011         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3012             ("uma_zone_set_zfini on non-empty keg"));
3013         zone->uz_fini = zfini;
3014         ZONE_UNLOCK(zone);
3015 }
3016
3017 /* See uma.h */
3018 /* XXX uk_freef is not actually used with the zone locked */
3019 void
3020 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3021 {
3022
3023         ZONE_LOCK(zone);
3024         zone_first_keg(zone)->uk_freef = freef;
3025         ZONE_UNLOCK(zone);
3026 }
3027
3028 /* See uma.h */
3029 /* XXX uk_allocf is not actually used with the zone locked */
3030 void
3031 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3032 {
3033         uma_keg_t keg;
3034
3035         ZONE_LOCK(zone);
3036         keg = zone_first_keg(zone);
3037         keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
3038         keg->uk_allocf = allocf;
3039         ZONE_UNLOCK(zone);
3040 }
3041
3042 /* See uma.h */
3043 int
3044 uma_zone_reserve_kva(uma_zone_t zone, int count)
3045 {
3046         uma_keg_t keg;
3047         vm_offset_t kva;
3048         int pages;
3049
3050         keg = zone_first_keg(zone);
3051         pages = count / keg->uk_ipers;
3052
3053         if (pages * keg->uk_ipers < count)
3054                 pages++;
3055
3056 #ifdef UMA_MD_SMALL_ALLOC
3057         if (keg->uk_ppera > 1) {
3058 #else
3059         if (1) {
3060 #endif
3061                 kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
3062                 if (kva == 0)
3063                         return (0);
3064         } else
3065                 kva = 0;
3066         ZONE_LOCK(zone);
3067         keg->uk_kva = kva;
3068         keg->uk_offset = 0;
3069         keg->uk_maxpages = pages;
3070 #ifdef UMA_MD_SMALL_ALLOC
3071         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3072 #else
3073         keg->uk_allocf = noobj_alloc;
3074 #endif
3075         keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
3076         ZONE_UNLOCK(zone);
3077         return (1);
3078 }
3079
3080 /* See uma.h */
3081 void
3082 uma_prealloc(uma_zone_t zone, int items)
3083 {
3084         int slabs;
3085         uma_slab_t slab;
3086         uma_keg_t keg;
3087
3088         keg = zone_first_keg(zone);
3089         ZONE_LOCK(zone);
3090         slabs = items / keg->uk_ipers;
3091         if (slabs * keg->uk_ipers < items)
3092                 slabs++;
3093         while (slabs > 0) {
3094                 slab = keg_alloc_slab(keg, zone, M_WAITOK);
3095                 if (slab == NULL)
3096                         break;
3097                 MPASS(slab->us_keg == keg);
3098                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3099                 slabs--;
3100         }
3101         ZONE_UNLOCK(zone);
3102 }
3103
3104 /* See uma.h */
3105 u_int32_t *
3106 uma_find_refcnt(uma_zone_t zone, void *item)
3107 {
3108         uma_slabrefcnt_t slabref;
3109         uma_keg_t keg;
3110         u_int32_t *refcnt;
3111         int idx;
3112
3113         slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
3114             (~UMA_SLAB_MASK));
3115         keg = slabref->us_keg;
3116         KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
3117             ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
3118         idx = ((unsigned long)item - (unsigned long)slabref->us_data)
3119             / keg->uk_rsize;
3120         refcnt = &slabref->us_freelist[idx].us_refcnt;
3121         return refcnt;
3122 }
3123
3124 /* See uma.h */
3125 void
3126 uma_reclaim(void)
3127 {
3128 #ifdef UMA_DEBUG
3129         printf("UMA: vm asked us to release pages!\n");
3130 #endif
3131         bucket_enable();
3132         zone_foreach(zone_drain);
3133         /*
3134          * Some slabs may have been freed but this zone will be visited early
3135          * we visit again so that we can free pages that are empty once other
3136          * zones are drained.  We have to do the same for buckets.
3137          */
3138         zone_drain(slabzone);
3139         zone_drain(slabrefzone);
3140         bucket_zone_drain();
3141 }
3142
3143 /* See uma.h */
3144 int
3145 uma_zone_exhausted(uma_zone_t zone)
3146 {
3147         int full;
3148
3149         ZONE_LOCK(zone);
3150         full = (zone->uz_flags & UMA_ZFLAG_FULL);
3151         ZONE_UNLOCK(zone);
3152         return (full);
3153 }
3154
3155 int
3156 uma_zone_exhausted_nolock(uma_zone_t zone)
3157 {
3158         return (zone->uz_flags & UMA_ZFLAG_FULL);
3159 }
3160
3161 void *
3162 uma_large_malloc(int size, int wait)
3163 {
3164         void *mem;
3165         uma_slab_t slab;
3166         u_int8_t flags;
3167
3168         slab = zone_alloc_item(slabzone, NULL, wait);
3169         if (slab == NULL)
3170                 return (NULL);
3171         mem = page_alloc(NULL, size, &flags, wait);
3172         if (mem) {
3173                 vsetslab((vm_offset_t)mem, slab);
3174                 slab->us_data = mem;
3175                 slab->us_flags = flags | UMA_SLAB_MALLOC;
3176                 slab->us_size = size;
3177         } else {
3178                 zone_free_item(slabzone, slab, NULL, SKIP_NONE,
3179                     ZFREE_STATFAIL | ZFREE_STATFREE);
3180         }
3181
3182         return (mem);
3183 }
3184
3185 void
3186 uma_large_free(uma_slab_t slab)
3187 {
3188         vsetobj((vm_offset_t)slab->us_data, kmem_object);
3189         page_free(slab->us_data, slab->us_size, slab->us_flags);
3190         zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
3191 }
3192
3193 void
3194 uma_print_stats(void)
3195 {
3196         zone_foreach(uma_print_zone);
3197 }
3198
3199 static void
3200 slab_print(uma_slab_t slab)
3201 {
3202         printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
3203                 slab->us_keg, slab->us_data, slab->us_freecount,
3204                 slab->us_firstfree);
3205 }
3206
3207 static void
3208 cache_print(uma_cache_t cache)
3209 {
3210         printf("alloc: %p(%d), free: %p(%d)\n",
3211                 cache->uc_allocbucket,
3212                 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3213                 cache->uc_freebucket,
3214                 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3215 }
3216
3217 static void
3218 uma_print_keg(uma_keg_t keg)
3219 {
3220         uma_slab_t slab;
3221
3222         printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3223             "out %d free %d limit %d\n",
3224             keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3225             keg->uk_ipers, keg->uk_ppera,
3226             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3227             (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3228         printf("Part slabs:\n");
3229         LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3230                 slab_print(slab);
3231         printf("Free slabs:\n");
3232         LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3233                 slab_print(slab);
3234         printf("Full slabs:\n");
3235         LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3236                 slab_print(slab);
3237 }
3238
3239 void
3240 uma_print_zone(uma_zone_t zone)
3241 {
3242         uma_cache_t cache;
3243         uma_klink_t kl;
3244         int i;
3245
3246         printf("zone: %s(%p) size %d flags %#x\n",
3247             zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3248         LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3249                 uma_print_keg(kl->kl_keg);
3250         CPU_FOREACH(i) {
3251                 cache = &zone->uz_cpu[i];
3252                 printf("CPU %d Cache:\n", i);
3253                 cache_print(cache);
3254         }
3255 }
3256
#ifdef DDB
/*
 * Sum a zone's statistics with those of its per-CPU caches.  Each output
 * pointer may be NULL, in which case that statistic is not returned.
 *
 * Arguments:
 *      z           The zone to summarize.
 *      cachefreep  Items sitting in the per-CPU alloc and free buckets.
 *      allocsp     Zone plus per-CPU allocation counts.
 *      freesp      Zone plus per-CPU free counts.
 *      sleepsp     The zone's sleep count.
 *
 * Note: the zone statistics are not updated, because the per-CPU counters
 * cannot be safely cleared from here.
 *
 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
 * safe from off-CPU; we should modify the caches to track this information
 * directly so that we don't have to.
 */
static void
uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
    u_int64_t *freesp, u_int64_t *sleepsp)
{
        uma_cache_t pcpu;
        u_int64_t nallocs, nfrees, nsleeps;
        int cached, cpu;

        /* Start from the zone-wide counters. */
        nallocs = z->uz_allocs;
        nfrees = z->uz_frees;
        nsleeps = z->uz_sleeps;
        cached = 0;

        /* Fold in each CPU's cache. */
        CPU_FOREACH(cpu) {
                pcpu = &z->uz_cpu[cpu];
                if (pcpu->uc_allocbucket != NULL)
                        cached += pcpu->uc_allocbucket->ub_cnt;
                if (pcpu->uc_freebucket != NULL)
                        cached += pcpu->uc_freebucket->ub_cnt;
                nallocs += pcpu->uc_allocs;
                nfrees += pcpu->uc_frees;
        }

        if (cachefreep != NULL)
                *cachefreep = cached;
        if (allocsp != NULL)
                *allocsp = nallocs;
        if (freesp != NULL)
                *freesp = nfrees;
        if (sleepsp != NULL)
                *sleepsp = nsleeps;
}
#endif /* DDB */
3301
3302 static int
3303 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3304 {
3305         uma_keg_t kz;
3306         uma_zone_t z;
3307         int count;
3308
3309         count = 0;
3310         mtx_lock(&uma_mtx);
3311         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3312                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3313                         count++;
3314         }
3315         mtx_unlock(&uma_mtx);
3316         return (sysctl_handle_int(oidp, &count, 0, req));
3317 }
3318
3319 static int
3320 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3321 {
3322         struct uma_stream_header ush;
3323         struct uma_type_header uth;
3324         struct uma_percpu_stat ups;
3325         uma_bucket_t bucket;
3326         struct sbuf sbuf;
3327         uma_cache_t cache;
3328         uma_klink_t kl;
3329         uma_keg_t kz;
3330         uma_zone_t z;
3331         uma_keg_t k;
3332         int count, error, i;
3333
3334         error = sysctl_wire_old_buffer(req, 0);
3335         if (error != 0)
3336                 return (error);
3337         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3338
3339         count = 0;
3340         mtx_lock(&uma_mtx);
3341         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3342                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3343                         count++;
3344         }
3345
3346         /*
3347          * Insert stream header.
3348          */
3349         bzero(&ush, sizeof(ush));
3350         ush.ush_version = UMA_STREAM_VERSION;
3351         ush.ush_maxcpus = (mp_maxid + 1);
3352         ush.ush_count = count;
3353         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3354
3355         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3356                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3357                         bzero(&uth, sizeof(uth));
3358                         ZONE_LOCK(z);
3359                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3360                         uth.uth_align = kz->uk_align;
3361                         uth.uth_size = kz->uk_size;
3362                         uth.uth_rsize = kz->uk_rsize;
3363                         LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3364                                 k = kl->kl_keg;
3365                                 uth.uth_maxpages += k->uk_maxpages;
3366                                 uth.uth_pages += k->uk_pages;
3367                                 uth.uth_keg_free += k->uk_free;
3368                                 uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3369                                     * k->uk_ipers;
3370                         }
3371
3372                         /*
3373                          * A zone is secondary is it is not the first entry
3374                          * on the keg's zone list.
3375                          */
3376                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3377                             (LIST_FIRST(&kz->uk_zones) != z))
3378                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3379
3380                         LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3381                                 uth.uth_zone_free += bucket->ub_cnt;
3382                         uth.uth_allocs = z->uz_allocs;
3383                         uth.uth_frees = z->uz_frees;
3384                         uth.uth_fails = z->uz_fails;
3385                         uth.uth_sleeps = z->uz_sleeps;
3386                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3387                         /*
3388                          * While it is not normally safe to access the cache
3389                          * bucket pointers while not on the CPU that owns the
3390                          * cache, we only allow the pointers to be exchanged
3391                          * without the zone lock held, not invalidated, so
3392                          * accept the possible race associated with bucket
3393                          * exchange during monitoring.
3394                          */
3395                         for (i = 0; i < (mp_maxid + 1); i++) {
3396                                 bzero(&ups, sizeof(ups));
3397                                 if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3398                                         goto skip;
3399                                 if (CPU_ABSENT(i))
3400                                         goto skip;
3401                                 cache = &z->uz_cpu[i];
3402                                 if (cache->uc_allocbucket != NULL)
3403                                         ups.ups_cache_free +=
3404                                             cache->uc_allocbucket->ub_cnt;
3405                                 if (cache->uc_freebucket != NULL)
3406                                         ups.ups_cache_free +=
3407                                             cache->uc_freebucket->ub_cnt;
3408                                 ups.ups_allocs = cache->uc_allocs;
3409                                 ups.ups_frees = cache->uc_frees;
3410 skip:
3411                                 (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3412                         }
3413                         ZONE_UNLOCK(z);
3414                 }
3415         }
3416         mtx_unlock(&uma_mtx);
3417         error = sbuf_finish(&sbuf);
3418         sbuf_delete(&sbuf);
3419         return (error);
3420 }
3421
#ifdef DDB
/*
 * DDB "show uma" command: print one line per zone with the item size, the
 * number of items in use (allocs - frees), the number of free items,
 * the total number of allocation requests and the sleep count.
 *
 * No locks are taken here.  Output stops early when the pager reports
 * that the user quit (db_pager_quit).
 */
DB_SHOW_COMMAND(uma, db_show_uma)
{
        u_int64_t allocs, frees, sleeps;
        uma_bucket_t bucket;
        uma_keg_t kz;
        uma_zone_t z;
        int cachefree;

        db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
            "Requests", "Sleeps");
        LIST_FOREACH(kz, &uma_kegs, uk_link) {
                LIST_FOREACH(z, &kz->uk_zones, uz_link) {
                        /*
                         * For internal kegs only the zone-wide counters are
                         * used; the per-CPU caches are not summed.
                         */
                        if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
                                allocs = z->uz_allocs;
                                frees = z->uz_frees;
                                sleeps = z->uz_sleeps;
                                cachefree = 0;
                        } else
                                uma_zone_sumstat(z, &cachefree, &allocs,
                                    &frees, &sleeps);
                        /*
                         * The keg's free items are counted for every zone
                         * except a secondary zone that is not first on the
                         * keg's zone list.
                         */
                        if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
                            (LIST_FIRST(&kz->uk_zones) != z)))
                                cachefree += kz->uk_free;
                        LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
                                cachefree += bucket->ub_cnt;
                        /* Every %ju/%jd argument is cast to the matching type. */
                        db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
                            (uintmax_t)kz->uk_size,
                            (intmax_t)(allocs - frees), cachefree,
                            (uintmax_t)allocs, (uintmax_t)sleeps);
                        if (db_pager_quit)
                                return;
                }
        }
}
#endif