sys/vm/uma_core.c

   1 /*-
   2  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
   3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   4  * Copyright (c) 2004-2006 Robert N. M. Watson
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice unmodified, this list of conditions, and the following
  12  *    disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * uma_core.c  Implementation of the Universal Memory allocator
  31  *
  32  * This allocator is intended to replace the multitude of similar object caches
  33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
   34  * efficient.  A primary design goal is to return unused memory to the rest of
  35  * the system.  This will make the system as a whole more flexible due to the
  36  * ability to move memory to subsystems which most need it instead of leaving
  37  * pools of reserved memory unused.
  38  *
  39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  40  * are well known.
  41  *
  42  */
  43
  44 /*
  45  * TODO:
  46  *      - Improve memory usage for large allocations
  47  *      - Investigate cache size adjustments
  48  */
  49
  50 #include <sys/cdefs.h>
  51 __FBSDID("$FreeBSD$");
  52
  53 /* I should really use ktr.. */
  54 /*
  55 #define UMA_DEBUG 1
  56 #define UMA_DEBUG_ALLOC 1
  57 #define UMA_DEBUG_ALLOC_1 1
  58 */
  59
  60 #include "opt_ddb.h"
  61 #include "opt_param.h"
  62 #include "opt_vm.h"
  63
  64 #include <sys/param.h>
  65 #include <sys/systm.h>
  66 #include <sys/kernel.h>
  67 #include <sys/types.h>
  68 #include <sys/queue.h>
  69 #include <sys/malloc.h>
  70 #include <sys/ktr.h>
  71 #include <sys/lock.h>
  72 #include <sys/sysctl.h>
  73 #include <sys/mutex.h>
  74 #include <sys/proc.h>
  75 #include <sys/rwlock.h>
  76 #include <sys/sbuf.h>
  77 #include <sys/smp.h>
  78 #include <sys/vmmeter.h>
  79
  80 #include <vm/vm.h>
  81 #include <vm/vm_object.h>
  82 #include <vm/vm_page.h>
  83 #include <vm/vm_pageout.h>
  84 #include <vm/vm_param.h>
  85 #include <vm/vm_map.h>
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/uma.h>
  89 #include <vm/uma_int.h>
  90 #include <vm/uma_dbg.h>
  91
  92 #include <ddb/ddb.h>
  93
  94 #ifdef DEBUG_MEMGUARD
  95 #include <vm/memguard.h>
  96 #endif
  97
   98 /*
   99  * This is the zone and keg from which all zones are spawned.  The idea is that
  100  * even the zone & keg heads are allocated from the allocator, so we use the
  101  * bss section to bootstrap us.
  102  */
  103 static struct uma_keg masterkeg;
  104 static struct uma_zone masterzone_k;
  105 static struct uma_zone masterzone_z;
  106 static uma_zone_t kegs = &masterzone_k;         /* NOTE(review): presumably the zone of keg headers -- confirm */
  107 static uma_zone_t zones = &masterzone_z;        /* NOTE(review): presumably the zone of zone headers -- confirm */
  108
  109 /* This is the zone from which all uma_slab_t's are allocated. */
  110 static uma_zone_t slabzone;
  111 static uma_zone_t slabrefzone;  /* With refcounters (for UMA_ZONE_REFCNT) */
  112
  113 /*
  114  * The initial hash tables come out of this zone so they can be allocated
  115  * prior to malloc coming up.
  116  */
  117 static uma_zone_t hashzone;
  118
  119 /* The boot-time adjusted value for cache line alignment. */
  120 int uma_align_cache = 64 - 1;   /* Default is 63: a 64-byte line expressed as size minus one */
  121
  122 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");  /* malloc type of the grown hash tables, see hash_alloc() */
  123
  124 /*
  125  * Are we allowed to allocate buckets?  Nonzero means no; see bucket_alloc().
  126  */
  127 static int bucketdisable = 1;   /* Starts disabled; bucket_enable() recomputes it */
  128
  129 /* Linked list of all kegs in the system */
  130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
  131
  132 /* This mutex protects the keg list */
  133 static struct mtx uma_mtx;
  134
  135 /* Linked list of boot time pages */
  136 static LIST_HEAD(,uma_slab) uma_boot_pages =
  137     LIST_HEAD_INITIALIZER(uma_boot_pages);
  138
  139 /* This mutex protects the boot time pages list */
  140 static struct mtx uma_boot_pages_mtx;
  141
  142 /* Is the VM done starting up? */
  143 static int booted = 0;          /* NOTE(review): presumably advanced to the UMA_STARTUP* values below -- confirm */
  144 #define UMA_STARTUP     1
  145 #define UMA_STARTUP2    2
  146
  147 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
  148 static u_int uma_max_ipers;
  149 static u_int uma_max_ipers_ref; /* NOTE(review): presumably the same limit for refcounted slabs -- confirm */
  150
  151 /*
  152  * This is the handle used to schedule events that need to happen
  153  * outside of the allocation fast path.  uma_timeout() re-arms it itself.
  154  */
  155 static struct callout uma_callout;
  156 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 157
  158 /*
  159  * This structure is passed as the zone ctor arg so that I don't have to create
  160  * a special allocation function just for zones.
  161  */
  162 struct uma_zctor_args {         /* NOTE(review): fields resemble the uma_zcreate() arguments, plus a keg; confirm in zone_ctor() */
  163         const char *name;
  164         size_t size;
  165         uma_ctor ctor;
  166         uma_dtor dtor;
  167         uma_init uminit;
  168         uma_fini fini;
  169         uma_keg_t keg;
  170         int align;
  171         uint32_t flags;
  172 };
 173
  174 struct uma_kctor_args {         /* Field for field the parameter list of uma_kcreate(); presumably the keg ctor argument -- confirm in keg_ctor() */
  175         uma_zone_t zone;
  176         size_t size;
  177         uma_init uminit;
  178         uma_fini fini;
  179         int align;
  180         uint32_t flags;
  181 };
 182
  183 struct uma_bucket_zone {        /* One row of bucket_zones[]: a zone that hands out buckets of one size */
  184         uma_zone_t      ubz_zone;       /* Zone of buckets; created by bucket_init() */
  185         char            *ubz_name;      /* Name passed to uma_zcreate() for that zone */
  186         int             ubz_entries;    /* Item pointers per bucket; 0 ends the table */
  187 };
 188
  189 #define BUCKET_MAX      128     /* Entries of the largest row in bucket_zones[] */
  190
  191 struct uma_bucket_zone bucket_zones[] = {
  192         { NULL, "16 Bucket", 16 },
  193         { NULL, "32 Bucket", 32 },
  194         { NULL, "64 Bucket", 64 },
  195         { NULL, "128 Bucket", 128 },
  196         { NULL, NULL, 0}        /* Terminator: the loops over this table stop at ubz_entries == 0 */
  197 };
  198
  199 #define BUCKET_SHIFT    4       /* One bucket_size[] slot per 1 << 4 = 16 requested entries */
  200 #define BUCKET_ZONES    ((BUCKET_MAX >> BUCKET_SHIFT) + 1)      /* 9 slots, for 0 through 128 entries */
  201
  202 /*
  203  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
  204  * of approximately the right size.  Each element indexes bucket_zones[].
  205  */
  206 static uint8_t bucket_size[BUCKET_ZONES];
 207
  208 /*
  209  * Flags and enumerations passed to zone_free_item() (skip mode, then flags).
  210  */
  211 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };     /* NOTE(review): presumably which teardown steps are skipped -- confirm in zone_free_item() */
  212
  213 #define ZFREE_STATFAIL  0x00000001      /* Update zone failure statistic. */
  214 #define ZFREE_STATFREE  0x00000002      /* Update zone free statistic. */
 215
  216 /* Forward declarations for the static functions of this file. */
  217
  218 static void *noobj_alloc(uma_zone_t, int, uint8_t *, int);
  219 static void *page_alloc(uma_zone_t, int, uint8_t *, int);
  220 static void *startup_alloc(uma_zone_t, int, uint8_t *, int);     /* These three appear to share the signature through which keg_alloc_slab() calls uk_allocf */
  221 static void page_free(void *, int, uint8_t);
  222 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
  223 static void cache_drain(uma_zone_t);
  224 static void bucket_drain(uma_zone_t, uma_bucket_t);
  225 static void bucket_cache_drain(uma_zone_t zone);
  226 static int keg_ctor(void *, int, void *, int);
  227 static void keg_dtor(void *, int, void *);
  228 static int zone_ctor(void *, int, void *, int);
  229 static void zone_dtor(void *, int, void *);
  230 static int zero_init(void *, int, int);
  231 static void keg_small_init(uma_keg_t keg);
  232 static void keg_large_init(uma_keg_t keg);
  233 static void zone_foreach(void (*zfunc)(uma_zone_t));
  234 static void zone_timeout(uma_zone_t zone);
  235 static int hash_alloc(struct uma_hash *);
  236 static int hash_expand(struct uma_hash *, struct uma_hash *);
  237 static void hash_free(struct uma_hash *hash);
  238 static void uma_timeout(void *);
  239 static void uma_startup3(void);
  240 static void *zone_alloc_item(uma_zone_t, void *, int);
  241 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
  242     int);
  243 static void bucket_enable(void);
  244 static void bucket_init(void);
  245 static uma_bucket_t bucket_alloc(int, int);
  246 static void bucket_free(uma_bucket_t);
  247 static void bucket_zone_drain(void);
  248 static int zone_alloc_bucket(uma_zone_t zone, int flags);
  249 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
  250 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
  251 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
  252 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
  253     uma_fini fini, int align, uint32_t flags);
  254 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
  255 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
  256
  257 void uma_print_zone(uma_zone_t);        /* Not static: this and the next one have external linkage */
  258 void uma_print_stats(void);
  259 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);   /* Handlers for the two SYSCTL_PROC nodes declared below */
  260 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 261
  262 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);      /* Registers uma_startup3() for SI_SUB_VM_CONF */
  263
  264 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,   /* Read-only vm.zone_count */
  265     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
  266
  267 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,        /* Read-only vm.zone_stats */
  268     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
  269
  270 static int zone_warnings = 1;   /* Zero silences zone_log_warning(); on by default */
  271 TUNABLE_INT("vm.zone_warnings", &zone_warnings);
  272 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,  /* Read-write vm.zone_warnings */
  273     "Warn when UMA zones becomes full");
 274
 275 /*
 276  * This routine checks to see whether or not it's safe to enable buckets.
 277  */
 278
 279 static void
 280 bucket_enable(void)
 281 {
 282         bucketdisable = vm_page_count_min();
 283 }
 284
 285 /*
 286  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 287  *
 288  * For each zone, calculate the memory required for each bucket, consisting
 289  * of the header and an array of pointers.  Initialize bucket_size[] to point
 290  * the range of appropriate bucket sizes at the zone.
 291  */
 292 static void
 293 bucket_init(void)
 294 {
 295         struct uma_bucket_zone *ubz;
 296         int i;
 297         int j;
 298
 299         for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
 300                 int size;
 301
 302                 ubz = &bucket_zones[j];
 303                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 304                 size += sizeof(void *) * ubz->ubz_entries;
 305                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 306                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 307                     UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 308                 for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
 309                         bucket_size[i >> BUCKET_SHIFT] = j;
 310         }
 311 }
 312
 313 /*
 314  * Given a desired number of entries for a bucket, return the zone from which
 315  * to allocate the bucket.
 316  */
 317 static struct uma_bucket_zone *
 318 bucket_zone_lookup(int entries)
 319 {
 320         int idx;
 321
 322         idx = howmany(entries, 1 << BUCKET_SHIFT);
 323         return (&bucket_zones[bucket_size[idx]]);
 324 }
 325
 326 static uma_bucket_t
 327 bucket_alloc(int entries, int bflags)
 328 {
 329         struct uma_bucket_zone *ubz;
 330         uma_bucket_t bucket;
 331
 332         /*
 333          * This is to stop us from allocating per cpu buckets while we're
 334          * running out of vm.boot_pages.  Otherwise, we would exhaust the
 335          * boot pages.  This also prevents us from allocating buckets in
 336          * low memory situations.
 337          */
 338         if (bucketdisable)
 339                 return (NULL);
 340
 341         ubz = bucket_zone_lookup(entries);
 342         bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
 343         if (bucket) {
 344 #ifdef INVARIANTS
 345                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 346 #endif
 347                 bucket->ub_cnt = 0;
 348                 bucket->ub_entries = ubz->ubz_entries;
 349         }
 350
 351         return (bucket);
 352 }
 353
 354 static void
 355 bucket_free(uma_bucket_t bucket)
 356 {
 357         struct uma_bucket_zone *ubz;
 358
 359         ubz = bucket_zone_lookup(bucket->ub_entries);
 360         zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
 361             ZFREE_STATFREE);
 362 }
 363
 364 static void
 365 bucket_zone_drain(void)
 366 {
 367         struct uma_bucket_zone *ubz;
 368
 369         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 370                 zone_drain(ubz->ubz_zone);
 371 }
 372
 373 static void
 374 zone_log_warning(uma_zone_t zone)
 375 {
 376         static const struct timeval warninterval = { 300, 0 };
 377
 378         if (!zone_warnings || zone->uz_warning == NULL)
 379                 return;
 380
 381         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 382                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 383 }
 384
 385 static inline uma_keg_t
 386 zone_first_keg(uma_zone_t zone)
 387 {
 388
 389         return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
 390 }
 391
 392 static void
 393 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 394 {
 395         uma_klink_t klink;
 396
 397         LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 398                 kegfn(klink->kl_keg);
 399 }
 400
 401 /*
 402  * Routine called by timeout which is used to fire off some time interval
 403  * based calculations.  (stats, hash size, etc.)
 404  *
 405  * Arguments:
 406  *      arg   Unused
 407  *
 408  * Returns:
 409  *      Nothing
 410  */
 411 static void
 412 uma_timeout(void *unused)
 413 {
 414         bucket_enable();
 415         zone_foreach(zone_timeout);
 416
 417         /* Reschedule this event */
 418         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 419 }
 420
 421 /*
 422  * Routine to perform timeout driven calculations.  This expands the
 423  * hashes and does per cpu statistics aggregation.
 424  *
 425  *  Returns nothing.
 426  */
 427 static void
 428 keg_timeout(uma_keg_t keg)
 429 {
 430
 431         KEG_LOCK(keg);
 432         /*
 433          * Expand the keg hash table.
 434          *
 435          * This is done if the number of slabs is larger than the hash size.
 436          * What I'm trying to do here is completely reduce collisions.  This
 437          * may be a little aggressive.  Should I allow for two collisions max?
 438          */
 439         if (keg->uk_flags & UMA_ZONE_HASH &&
 440             keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 441                 struct uma_hash newhash;
 442                 struct uma_hash oldhash;
 443                 int ret;
 444
 445                 /*
 446                  * This is so involved because allocating and freeing
 447                  * while the keg lock is held will lead to deadlock.
 448                  * I have to do everything in stages and check for
 449                  * races.
 450                  */
 451                 newhash = keg->uk_hash;
 452                 KEG_UNLOCK(keg);
 453                 ret = hash_alloc(&newhash);
 454                 KEG_LOCK(keg);
 455                 if (ret) {
 456                         if (hash_expand(&keg->uk_hash, &newhash)) {
 457                                 oldhash = keg->uk_hash;
 458                                 keg->uk_hash = newhash;
 459                         } else
 460                                 oldhash = newhash;
 461
 462                         KEG_UNLOCK(keg);
 463                         hash_free(&oldhash);
 464                         KEG_LOCK(keg);
 465                 }
 466         }
 467         KEG_UNLOCK(keg);
 468 }
 469
 470 static void
 471 zone_timeout(uma_zone_t zone)
 472 {
 473
 474         zone_foreach_keg(zone, &keg_timeout);
 475 }
 476
 477 /*
 478  * Allocate and zero fill the next sized hash table from the appropriate
 479  * backing store.
 480  *
 481  * Arguments:
 482  *      hash  A new hash structure with the old hash size in uh_hashsize
 483  *
 484  * Returns:
 485  *      1 on sucess and 0 on failure.
 486  */
 487 static int
 488 hash_alloc(struct uma_hash *hash)
 489 {
 490         int oldsize;
 491         int alloc;
 492
 493         oldsize = hash->uh_hashsize;
 494
 495         /* We're just going to go to a power of two greater */
 496         if (oldsize)  {
 497                 hash->uh_hashsize = oldsize * 2;
 498                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 499                 hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 500                     M_UMAHASH, M_NOWAIT);
 501         } else {
 502                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 503                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 504                     M_WAITOK);
 505                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 506         }
 507         if (hash->uh_slab_hash) {
 508                 bzero(hash->uh_slab_hash, alloc);
 509                 hash->uh_hashmask = hash->uh_hashsize - 1;
 510                 return (1);
 511         }
 512
 513         return (0);
 514 }
 515
 516 /*
 517  * Expands the hash table for HASH zones.  This is done from zone_timeout
 518  * to reduce collisions.  This must not be done in the regular allocation
 519  * path, otherwise, we can recurse on the vm while allocating pages.
 520  *
 521  * Arguments:
 522  *      oldhash  The hash you want to expand
 523  *      newhash  The hash structure for the new table
 524  *
 525  * Returns:
 526  *      Nothing
 527  *
 528  * Discussion:
 529  */
 530 static int
 531 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 532 {
 533         uma_slab_t slab;
 534         int hval;
 535         int i;
 536
 537         if (!newhash->uh_slab_hash)
 538                 return (0);
 539
 540         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 541                 return (0);
 542
 543         /*
 544          * I need to investigate hash algorithms for resizing without a
 545          * full rehash.
 546          */
 547
 548         for (i = 0; i < oldhash->uh_hashsize; i++)
 549                 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 550                         slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 551                         SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 552                         hval = UMA_HASH(newhash, slab->us_data);
 553                         SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 554                             slab, us_hlink);
 555                 }
 556
 557         return (1);
 558 }
 559
 560 /*
 561  * Free the hash bucket to the appropriate backing store.
 562  *
 563  * Arguments:
 564  *      slab_hash  The hash bucket we're freeing
 565  *      hashsize   The number of entries in that hash bucket
 566  *
 567  * Returns:
 568  *      Nothing
 569  */
 570 static void
 571 hash_free(struct uma_hash *hash)
 572 {
 573         if (hash->uh_slab_hash == NULL)
 574                 return;
 575         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 576                 zone_free_item(hashzone,
 577                     hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
 578         else
 579                 free(hash->uh_slab_hash, M_UMAHASH);
 580 }
 581
 582 /*
 583  * Frees all outstanding items in a bucket
 584  *
 585  * Arguments:
 586  *      zone   The zone to free to, must be unlocked.
 587  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 588  *
 589  * Returns:
 590  *      Nothing
 591  */
 592
 593 static void
 594 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 595 {
 596         void *item;
 597
 598         if (bucket == NULL)
 599                 return;
 600
 601         while (bucket->ub_cnt > 0)  {
 602                 bucket->ub_cnt--;
 603                 item = bucket->ub_bucket[bucket->ub_cnt];
 604 #ifdef INVARIANTS
 605                 bucket->ub_bucket[bucket->ub_cnt] = NULL;
 606                 KASSERT(item != NULL,
 607                     ("bucket_drain: botched ptr, item is NULL"));
 608 #endif
 609                 zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
 610         }
 611 }
 612
 613 /*
 614  * Drains the per cpu caches for a zone.
 615  *
 616  * NOTE: This may only be called while the zone is being turn down, and not
 617  * during normal operation.  This is necessary in order that we do not have
 618  * to migrate CPUs to drain the per-CPU caches.
 619  *
 620  * Arguments:
 621  *      zone     The zone to drain, must be unlocked.
 622  *
 623  * Returns:
 624  *      Nothing
 625  */
 626 static void
 627 cache_drain(uma_zone_t zone)
 628 {
 629         uma_cache_t cache;
 630         int cpu;
 631
 632         /*
 633          * XXX: It is safe to not lock the per-CPU caches, because we're
 634          * tearing down the zone anyway.  I.e., there will be no further use
 635          * of the caches at this point.
 636          *
 637          * XXX: It would good to be able to assert that the zone is being
 638          * torn down to prevent improper use of cache_drain().
 639          *
 640          * XXX: We lock the zone before passing into bucket_cache_drain() as
 641          * it is used elsewhere.  Should the tear-down path be made special
 642          * there in some form?
 643          */
 644         CPU_FOREACH(cpu) {
 645                 cache = &zone->uz_cpu[cpu];
 646                 bucket_drain(zone, cache->uc_allocbucket);
 647                 bucket_drain(zone, cache->uc_freebucket);
 648                 if (cache->uc_allocbucket != NULL)
 649                         bucket_free(cache->uc_allocbucket);
 650                 if (cache->uc_freebucket != NULL)
 651                         bucket_free(cache->uc_freebucket);
 652                 cache->uc_allocbucket = cache->uc_freebucket = NULL;
 653         }
 654         ZONE_LOCK(zone);
 655         bucket_cache_drain(zone);
 656         ZONE_UNLOCK(zone);
 657 }
 658
 659 /*
 660  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
 661  */
 662 static void
 663 bucket_cache_drain(uma_zone_t zone)
 664 {
 665         uma_bucket_t bucket;
 666
 667         /*
 668          * Drain the bucket queues and free the buckets, we just keep two per
 669          * cpu (alloc/free).
 670          */
 671         while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 672                 LIST_REMOVE(bucket, ub_link);
 673                 ZONE_UNLOCK(zone);
 674                 bucket_drain(zone, bucket);
 675                 bucket_free(bucket);
 676                 ZONE_LOCK(zone);
 677         }
 678
 679         /* Now we do the free queue.. */
 680         while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 681                 LIST_REMOVE(bucket, ub_link);
 682                 bucket_free(bucket);
 683         }
 684 }
 685
 686 /*
 687  * Frees pages from a keg back to the system.  This is done on demand from
 688  * the pageout daemon.
 689  *
 690  * Returns nothing.
 691  */
 692 static void
 693 keg_drain(uma_keg_t keg)
 694 {
 695         struct slabhead freeslabs = { 0 };
 696         uma_slab_t slab;
 697         uma_slab_t n;
 698         uint8_t flags;
 699         uint8_t *mem;
 700         int i;
 701
 702         /*
 703          * We don't want to take pages from statically allocated kegs at this
 704          * time
 705          */
 706         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 707                 return;
 708
 709 #ifdef UMA_DEBUG
 710         printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 711 #endif
 712         KEG_LOCK(keg);
 713         if (keg->uk_free == 0)
 714                 goto finished;
 715
 716         slab = LIST_FIRST(&keg->uk_free_slab);
 717         while (slab) {
 718                 n = LIST_NEXT(slab, us_link);
 719
 720                 /* We have no where to free these to */
 721                 if (slab->us_flags & UMA_SLAB_BOOT) {
 722                         slab = n;
 723                         continue;
 724                 }
 725
 726                 LIST_REMOVE(slab, us_link);
 727                 keg->uk_pages -= keg->uk_ppera;
 728                 keg->uk_free -= keg->uk_ipers;
 729
 730                 if (keg->uk_flags & UMA_ZONE_HASH)
 731                         UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 732
 733                 SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 734
 735                 slab = n;
 736         }
 737 finished:
 738         KEG_UNLOCK(keg);
 739
 740         while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 741                 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 742                 if (keg->uk_fini)
 743                         for (i = 0; i < keg->uk_ipers; i++)
 744                                 keg->uk_fini(
 745                                     slab->us_data + (keg->uk_rsize * i),
 746                                     keg->uk_size);
 747                 flags = slab->us_flags;
 748                 mem = slab->us_data;
 749
 750                 if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 751                         vm_object_t obj;
 752
 753                         if (flags & UMA_SLAB_KMEM)
 754                                 obj = kmem_object;
 755                         else if (flags & UMA_SLAB_KERNEL)
 756                                 obj = kernel_object;
 757                         else
 758                                 obj = NULL;
 759                         for (i = 0; i < keg->uk_ppera; i++)
 760                                 vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
 761                                     obj);
 762                 }
 763                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 764                         zone_free_item(keg->uk_slabzone, slab, NULL,
 765                             SKIP_NONE, ZFREE_STATFREE);
 766 #ifdef UMA_DEBUG
 767                 printf("%s: Returning %d bytes.\n",
 768                     keg->uk_name, PAGE_SIZE * keg->uk_ppera);
 769 #endif
 770                 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 771         }
 772 }
 773
 774 static void
 775 zone_drain_wait(uma_zone_t zone, int waitok)
 776 {
 777
 778         /*
 779          * Set draining to interlock with zone_dtor() so we can release our
 780          * locks as we go.  Only dtor() should do a WAITOK call since it
 781          * is the only call that knows the structure will still be available
 782          * when it wakes up.
 783          */
 784         ZONE_LOCK(zone);
 785         while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 786                 if (waitok == M_NOWAIT)
 787                         goto out;
 788                 mtx_unlock(&uma_mtx);
 789                 msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
 790                 mtx_lock(&uma_mtx);
 791         }
 792         zone->uz_flags |= UMA_ZFLAG_DRAINING;
 793         bucket_cache_drain(zone);
 794         ZONE_UNLOCK(zone);
 795         /*
 796          * The DRAINING flag protects us from being freed while
 797          * we're running.  Normally the uma_mtx would protect us but we
 798          * must be able to release and acquire the right lock for each keg.
 799          */
 800         zone_foreach_keg(zone, &keg_drain);
 801         ZONE_LOCK(zone);
 802         zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 803         wakeup(zone);
 804 out:
 805         ZONE_UNLOCK(zone);
 806 }
 807
 808 void
 809 zone_drain(uma_zone_t zone)
 810 {
 811
 812         zone_drain_wait(zone, M_NOWAIT);
 813 }
 814
 815 /*
 816  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
 817  *
 818  * Arguments:
 819  *      wait  Shall we wait?
 820  *
 821  * Returns:
 822  *      The slab that was allocated or NULL if there is no memory and the
 823  *      caller specified M_NOWAIT.
 824  */
 825 static uma_slab_t
 826 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 827 {
 828         uma_slabrefcnt_t slabref;
 829         uma_alloc allocf;
 830         uma_slab_t slab;
 831         uint8_t *mem;
 832         uint8_t flags;
 833         int i;
 834
 835         mtx_assert(&keg->uk_lock, MA_OWNED);
 836         slab = NULL;
 837
 838 #ifdef UMA_DEBUG
 839         printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
 840 #endif
 841         allocf = keg->uk_allocf;
 842         KEG_UNLOCK(keg);
 843
 844         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 845                 slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
 846                 if (slab == NULL) {
 847                         KEG_LOCK(keg);
 848                         return NULL;
 849                 }
 850         }
 851
 852         /*
 853          * This reproduces the old vm_zone behavior of zero filling pages the
 854          * first time they are added to a zone.
 855          *
 856          * Malloced items are zeroed in uma_zalloc.
 857          */
 858
 859         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 860                 wait |= M_ZERO;
 861         else
 862                 wait &= ~M_ZERO;
 863
 864         if (keg->uk_flags & UMA_ZONE_NODUMP)
 865                 wait |= M_NODUMP;
 866
 867         /* zone is passed for legacy reasons. */
 868         mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
 869         if (mem == NULL) {
 870                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 871                         zone_free_item(keg->uk_slabzone, slab, NULL,
 872                             SKIP_NONE, ZFREE_STATFREE);
 873                 KEG_LOCK(keg);
 874                 return (NULL);
 875         }
 876
 877         /* Point the slab into the allocated memory */
 878         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
 879                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
 880
 881         if (keg->uk_flags & UMA_ZONE_VTOSLAB)
 882                 for (i = 0; i < keg->uk_ppera; i++)
 883                         vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
 884
 885         slab->us_keg = keg;
 886         slab->us_data = mem;
 887         slab->us_freecount = keg->uk_ipers;
 888         slab->us_firstfree = 0;
 889         slab->us_flags = flags;
 890
 891         if (keg->uk_flags & UMA_ZONE_REFCNT) {
 892                 slabref = (uma_slabrefcnt_t)slab;
 893                 for (i = 0; i < keg->uk_ipers; i++) {
 894                         slabref->us_freelist[i].us_refcnt = 0;
 895                         slabref->us_freelist[i].us_item = i+1;
 896                 }
 897         } else {
 898                 for (i = 0; i < keg->uk_ipers; i++)
 899                         slab->us_freelist[i].us_item = i+1;
 900         }
 901
 902         if (keg->uk_init != NULL) {
 903                 for (i = 0; i < keg->uk_ipers; i++)
 904                         if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
 905                             keg->uk_size, wait) != 0)
 906                                 break;
 907                 if (i != keg->uk_ipers) {
 908                         if (keg->uk_fini != NULL) {
 909                                 for (i--; i > -1; i--)
 910                                         keg->uk_fini(slab->us_data +
 911                                             (keg->uk_rsize * i),
 912                                             keg->uk_size);
 913                         }
 914                         if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 915                                 vm_object_t obj;
 916
 917                                 if (flags & UMA_SLAB_KMEM)
 918                                         obj = kmem_object;
 919                                 else if (flags & UMA_SLAB_KERNEL)
 920                                         obj = kernel_object;
 921                                 else
 922                                         obj = NULL;
 923                                 for (i = 0; i < keg->uk_ppera; i++)
 924                                         vsetobj((vm_offset_t)mem +
 925                                             (i * PAGE_SIZE), obj);
 926                         }
 927                         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 928                                 zone_free_item(keg->uk_slabzone, slab,
 929                                     NULL, SKIP_NONE, ZFREE_STATFREE);
 930                         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera,
 931                             flags);
 932                         KEG_LOCK(keg);
 933                         return (NULL);
 934                 }
 935         }
 936         KEG_LOCK(keg);
 937
 938         if (keg->uk_flags & UMA_ZONE_HASH)
 939                 UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 940
 941         keg->uk_pages += keg->uk_ppera;
 942         keg->uk_free += keg->uk_ipers;
 943
 944         return (slab);
 945 }
 946
 947 /*
 948  * This function is intended to be used early on in place of page_alloc() so
 949  * that we may use the boot time page cache to satisfy allocations before
 950  * the VM is ready.
 951  */
 952 static void *
 953 startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
 954 {
 955         uma_keg_t keg;
 956         uma_slab_t tmps;
 957         int pages, check_pages;
 958
 959         keg = zone_first_keg(zone);
 960         pages = howmany(bytes, PAGE_SIZE);
 961         check_pages = pages - 1;
 962         KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
 963
 964         /*
 965          * Check our small startup cache to see if it has pages remaining.
 966          */
 967         mtx_lock(&uma_boot_pages_mtx);
 968
 969         /* First check if we have enough room. */
 970         tmps = LIST_FIRST(&uma_boot_pages);
 971         while (tmps != NULL && check_pages-- > 0)
 972                 tmps = LIST_NEXT(tmps, us_link);
 973         if (tmps != NULL) {
 974                 /*
 975                  * It's ok to lose tmps references.  The last one will
 976                  * have tmps->us_data pointing to the start address of
 977                  * "pages" contiguous pages of memory.
 978                  */
 979                 while (pages-- > 0) {
 980                         tmps = LIST_FIRST(&uma_boot_pages);
 981                         LIST_REMOVE(tmps, us_link);
 982                 }
 983                 mtx_unlock(&uma_boot_pages_mtx);
 984                 *pflag = tmps->us_flags;
 985                 return (tmps->us_data);
 986         }
 987         mtx_unlock(&uma_boot_pages_mtx);
 988         if (booted < UMA_STARTUP2)
 989                 panic("UMA: Increase vm.boot_pages");
 990         /*
 991          * Now that we've booted reset these users to their real allocator.
 992          */
 993 #ifdef UMA_MD_SMALL_ALLOC
 994         keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
 995 #else
 996         keg->uk_allocf = page_alloc;
 997 #endif
 998         return keg->uk_allocf(zone, bytes, pflag, wait);
 999 }
1000
1001 /*
1002  * Allocates a number of pages from the system
1003  *
1004  * Arguments:
1005  *      bytes  The number of bytes requested
1006  *      wait  Shall we wait?
1007  *
1008  * Returns:
1009  *      A pointer to the alloced memory or possibly
1010  *      NULL if M_NOWAIT is set.
1011  */
1012 static void *
1013 page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
1014 {
1015         void *p;        /* Returned page */
1016
1017         *pflag = UMA_SLAB_KMEM;
1018         p = (void *) kmem_malloc(kmem_map, bytes, wait);
1019
1020         return (p);
1021 }
1022
1023 /*
1024  * Allocates a number of pages from within an object
1025  *
1026  * Arguments:
1027  *      bytes  The number of bytes requested
1028  *      wait   Shall we wait?
1029  *
1030  * Returns:
1031  *      A pointer to the alloced memory or possibly
1032  *      NULL if M_NOWAIT is set.
1033  */
1034 static void *
1035 noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
1036 {
1037         TAILQ_HEAD(, vm_page) alloctail;
1038         u_long npages;
1039         vm_offset_t retkva, zkva;
1040         vm_page_t p, p_next;
1041         uma_keg_t keg;
1042
1043         TAILQ_INIT(&alloctail);
1044         keg = zone_first_keg(zone);
1045
1046         npages = howmany(bytes, PAGE_SIZE);
1047         while (npages > 0) {
1048                 p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1049                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1050                 if (p != NULL) {
1051                         /*
1052                          * Since the page does not belong to an object, its
1053                          * listq is unused.
1054                          */
1055                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1056                         npages--;
1057                         continue;
1058                 }
1059                 if (wait & M_WAITOK) {
1060                         VM_WAIT;
1061                         continue;
1062                 }
1063
1064                 /*
1065                  * Page allocation failed, free intermediate pages and
1066                  * exit.
1067                  */
1068                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1069                         vm_page_unwire(p, 0);
1070                         vm_page_free(p);
1071                 }
1072                 return (NULL);
1073         }
1074         *flags = UMA_SLAB_PRIV;
1075         zkva = keg->uk_kva +
1076             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1077         retkva = zkva;
1078         TAILQ_FOREACH(p, &alloctail, listq) {
1079                 pmap_qenter(zkva, &p, 1);
1080                 zkva += PAGE_SIZE;
1081         }
1082
1083         return ((void *)retkva);
1084 }
1085
1086 /*
1087  * Frees a number of pages to the system
1088  *
1089  * Arguments:
1090  *      mem   A pointer to the memory to be freed
1091  *      size  The size of the memory being freed
1092  *      flags The original p->us_flags field
1093  *
1094  * Returns:
1095  *      Nothing
1096  */
1097 static void
1098 page_free(void *mem, int size, uint8_t flags)
1099 {
1100         vm_map_t map;
1101
1102         if (flags & UMA_SLAB_KMEM)
1103                 map = kmem_map;
1104         else if (flags & UMA_SLAB_KERNEL)
1105                 map = kernel_map;
1106         else
1107                 panic("UMA: page_free used with invalid flags %d", flags);
1108
1109         kmem_free(map, (vm_offset_t)mem, size);
1110 }
1111
/*
 * Item initializer that clears the item to all zero bytes.
 *
 * Arguments/Returns follow uma_init specifications
 *	mem    Start of the item
 *	size   Number of bytes to clear
 *	flags  Ignored
 *
 * Always returns 0.
 */
static int
zero_init(void *mem, int size, int flags)
{

	bzero(mem, size);
	return (0);
}
1123
1124 /*
1125  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1126  *
1127  * Arguments
1128  *      keg  The zone we should initialize
1129  *
1130  * Returns
1131  *      Nothing
1132  */
1133 static void
1134 keg_small_init(uma_keg_t keg)
1135 {
1136         u_int rsize;
1137         u_int memused;
1138         u_int wastedspace;
1139         u_int shsize;
1140
1141         if (keg->uk_flags & UMA_ZONE_PCPU) {
1142                 KASSERT(mp_ncpus > 0, ("%s: ncpus %d\n", __func__, mp_ncpus));
1143                 keg->uk_slabsize = sizeof(struct pcpu);
1144                 keg->uk_ppera = howmany(mp_ncpus * sizeof(struct pcpu),
1145                     PAGE_SIZE);
1146         } else {
1147                 keg->uk_slabsize = UMA_SLAB_SIZE;
1148                 keg->uk_ppera = 1;
1149         }
1150
1151         rsize = keg->uk_size;
1152
1153         if (rsize & keg->uk_align)
1154                 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1155         if (rsize < keg->uk_slabsize / 256)
1156                 rsize = keg->uk_slabsize / 256;
1157
1158         keg->uk_rsize = rsize;
1159
1160         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1161             keg->uk_rsize < sizeof(struct pcpu),
1162             ("%s: size %u too large", __func__, keg->uk_rsize));
1163
1164         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1165                 shsize = 0;
1166         } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1167                 rsize += UMA_FRITMREF_SZ;       /* linkage & refcnt */
1168                 shsize = sizeof(struct uma_slab_refcnt);
1169         } else {
1170                 rsize += UMA_FRITM_SZ;  /* Account for linkage */
1171                 shsize = sizeof(struct uma_slab);
1172         }
1173
1174         keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize;
1175         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= 256,
1176             ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1177
1178         memused = keg->uk_ipers * rsize + shsize;
1179         wastedspace = keg->uk_slabsize - memused;
1180
1181         /*
1182          * We can't do OFFPAGE if we're internal or if we've been
1183          * asked to not go to the VM for buckets.  If we do this we
1184          * may end up going to the VM (kmem_map) for slabs which we
1185          * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
1186          * result of UMA_ZONE_VM, which clearly forbids it.
1187          */
1188         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1189             (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1190                 return;
1191
1192         if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) &&
1193             (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) {
1194                 keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize;
1195                 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= 256,
1196                     ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1197 #ifdef UMA_DEBUG
1198                 printf("UMA decided we need offpage slab headers for "
1199                     "keg: %s, calculated wastedspace = %d, "
1200                     "maximum wasted space allowed = %d, "
1201                     "calculated ipers = %d, "
1202                     "new wasted space = %d\n", keg->uk_name, wastedspace,
1203                     keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1204                     keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize);
1205 #endif
1206                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1207         }
1208
1209         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1210             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1211                 keg->uk_flags |= UMA_ZONE_HASH;
1212 }
1213
1214 /*
1215  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1216  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1217  * more complicated.
1218  *
1219  * Arguments
1220  *      keg  The keg we should initialize
1221  *
1222  * Returns
1223  *      Nothing
1224  */
1225 static void
1226 keg_large_init(uma_keg_t keg)
1227 {
1228
1229         KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1230         KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1231             ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1232         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1233             ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1234
1235         keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1236         keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE;
1237         keg->uk_ipers = 1;
1238         keg->uk_rsize = keg->uk_size;
1239
1240         /* We can't do OFFPAGE if we're internal, bail out here. */
1241         if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1242                 return;
1243
1244         keg->uk_flags |= UMA_ZONE_OFFPAGE;
1245         if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1246                 keg->uk_flags |= UMA_ZONE_HASH;
1247 }
1248
1249 static void
1250 keg_cachespread_init(uma_keg_t keg)
1251 {
1252         int alignsize;
1253         int trailer;
1254         int pages;
1255         int rsize;
1256
1257         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1258             ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1259
1260         alignsize = keg->uk_align + 1;
1261         rsize = keg->uk_size;
1262         /*
1263          * We want one item to start on every align boundary in a page.  To
1264          * do this we will span pages.  We will also extend the item by the
1265          * size of align if it is an even multiple of align.  Otherwise, it
1266          * would fall on the same boundary every time.
1267          */
1268         if (rsize & keg->uk_align)
1269                 rsize = (rsize & ~keg->uk_align) + alignsize;
1270         if ((rsize & alignsize) == 0)
1271                 rsize += alignsize;
1272         trailer = rsize - keg->uk_size;
1273         pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1274         pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1275         keg->uk_rsize = rsize;
1276         keg->uk_ppera = pages;
1277         keg->uk_slabsize = UMA_SLAB_SIZE;
1278         keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1279         keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1280         KASSERT(keg->uk_ipers <= uma_max_ipers,
1281             ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1282             keg->uk_ipers));
1283 }
1284
1285 /*
1286  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1287  * the keg onto the global keg list.
1288  *
1289  * Arguments/Returns follow uma_ctor specifications
1290  *      udata  Actually uma_kctor_args
1291  */
1292 static int
1293 keg_ctor(void *mem, int size, void *udata, int flags)
1294 {
1295         struct uma_kctor_args *arg = udata;
1296         uma_keg_t keg = mem;
1297         uma_zone_t zone;
1298
1299         bzero(keg, size);
1300         keg->uk_size = arg->size;
1301         keg->uk_init = arg->uminit;
1302         keg->uk_fini = arg->fini;
1303         keg->uk_align = arg->align;
1304         keg->uk_free = 0;
1305         keg->uk_pages = 0;
1306         keg->uk_flags = arg->flags;
1307         keg->uk_allocf = page_alloc;
1308         keg->uk_freef = page_free;
1309         keg->uk_recurse = 0;
1310         keg->uk_slabzone = NULL;
1311
1312         /*
1313          * The master zone is passed to us at keg-creation time.
1314          */
1315         zone = arg->zone;
1316         keg->uk_name = zone->uz_name;
1317
1318         if (arg->flags & UMA_ZONE_VM)
1319                 keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1320
1321         if (arg->flags & UMA_ZONE_ZINIT)
1322                 keg->uk_init = zero_init;
1323
1324         if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1325                 keg->uk_flags |= UMA_ZONE_VTOSLAB;
1326
1327         if (arg->flags & UMA_ZONE_PCPU)
1328 #ifdef SMP
1329                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1330 #else
1331                 keg->uk_flags &= ~UMA_ZONE_PCPU;
1332 #endif
1333
1334         /*
1335          * The +UMA_FRITM_SZ added to uk_size is to account for the
1336          * linkage that is added to the size in keg_small_init().  If
1337          * we don't account for this here then we may end up in
1338          * keg_small_init() with a calculated 'ipers' of 0.
1339          */
1340         if (keg->uk_flags & UMA_ZONE_REFCNT) {
1341                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1342                         keg_cachespread_init(keg);
1343                 else if ((keg->uk_size+UMA_FRITMREF_SZ) >
1344                     (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
1345                         keg_large_init(keg);
1346                 else
1347                         keg_small_init(keg);
1348         } else {
1349                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1350                         keg_cachespread_init(keg);
1351                 else if ((keg->uk_size+UMA_FRITM_SZ) >
1352                     (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1353                         keg_large_init(keg);
1354                 else
1355                         keg_small_init(keg);
1356         }
1357
1358         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1359                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1360                         keg->uk_slabzone = slabrefzone;
1361                 else
1362                         keg->uk_slabzone = slabzone;
1363         }
1364
1365         /*
1366          * If we haven't booted yet we need allocations to go through the
1367          * startup cache until the vm is ready.
1368          */
1369         if (keg->uk_ppera == 1) {
1370 #ifdef UMA_MD_SMALL_ALLOC
1371                 keg->uk_allocf = uma_small_alloc;
1372                 keg->uk_freef = uma_small_free;
1373
1374                 if (booted < UMA_STARTUP)
1375                         keg->uk_allocf = startup_alloc;
1376 #else
1377                 if (booted < UMA_STARTUP2)
1378                         keg->uk_allocf = startup_alloc;
1379 #endif
1380         } else if (booted < UMA_STARTUP2 &&
1381             (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1382                 keg->uk_allocf = startup_alloc;
1383
1384         /*
1385          * Initialize keg's lock (shared among zones).
1386          */
1387         if (arg->flags & UMA_ZONE_MTXCLASS)
1388                 KEG_LOCK_INIT(keg, 1);
1389         else
1390                 KEG_LOCK_INIT(keg, 0);
1391
1392         /*
1393          * If we're putting the slab header in the actual page we need to
1394          * figure out where in each page it goes.  This calculates a right
1395          * justified offset into the memory on an ALIGN_PTR boundary.
1396          */
1397         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1398                 u_int totsize;
1399
1400                 /* Size of the slab struct and free list */
1401                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1402                         totsize = sizeof(struct uma_slab_refcnt) +
1403                             keg->uk_ipers * UMA_FRITMREF_SZ;
1404                 else
1405                         totsize = sizeof(struct uma_slab) +
1406                             keg->uk_ipers * UMA_FRITM_SZ;
1407
1408                 if (totsize & UMA_ALIGN_PTR)
1409                         totsize = (totsize & ~UMA_ALIGN_PTR) +
1410                             (UMA_ALIGN_PTR + 1);
1411                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1412
1413                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1414                         totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
1415                             + keg->uk_ipers * UMA_FRITMREF_SZ;
1416                 else
1417                         totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1418                             + keg->uk_ipers * UMA_FRITM_SZ;
1419
1420                 /*
1421                  * The only way the following is possible is if with our
1422                  * UMA_ALIGN_PTR adjustments we are now bigger than
1423                  * UMA_SLAB_SIZE.  I haven't checked whether this is
1424                  * mathematically possible for all cases, so we make
1425                  * sure here anyway.
1426                  */
1427                 if (totsize > PAGE_SIZE * keg->uk_ppera) {
1428                         printf("zone %s ipers %d rsize %d size %d\n",
1429                             zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1430                             keg->uk_size);
1431                         panic("UMA slab won't fit.");
1432                 }
1433         }
1434
1435         if (keg->uk_flags & UMA_ZONE_HASH)
1436                 hash_alloc(&keg->uk_hash);
1437
1438 #ifdef UMA_DEBUG
1439         printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1440             zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1441             keg->uk_ipers, keg->uk_ppera,
1442             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1443 #endif
1444
1445         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1446
1447         mtx_lock(&uma_mtx);
1448         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1449         mtx_unlock(&uma_mtx);
1450         return (0);
1451 }
1452
1453 /*
1454  * Zone header ctor.  This initializes all fields, locks, etc.
1455  *
1456  * Arguments/Returns follow uma_ctor specifications
1457  *      udata  Actually uma_zctor_args
1458  */
1459 static int
1460 zone_ctor(void *mem, int size, void *udata, int flags)
1461 {
1462         struct uma_zctor_args *arg = udata;
1463         uma_zone_t zone = mem;
1464         uma_zone_t z;
1465         uma_keg_t keg;
1466
1467         bzero(zone, size);
1468         zone->uz_name = arg->name;
1469         zone->uz_ctor = arg->ctor;
1470         zone->uz_dtor = arg->dtor;
1471         zone->uz_slab = zone_fetch_slab;
1472         zone->uz_init = NULL;
1473         zone->uz_fini = NULL;
1474         zone->uz_allocs = 0;
1475         zone->uz_frees = 0;
1476         zone->uz_fails = 0;
1477         zone->uz_sleeps = 0;
1478         zone->uz_fills = zone->uz_count = 0;
1479         zone->uz_flags = 0;
1480         zone->uz_warning = NULL;
1481         timevalclear(&zone->uz_ratecheck);
1482         keg = arg->keg;
1483
1484         if (arg->flags & UMA_ZONE_SECONDARY) {
1485                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1486                 zone->uz_init = arg->uminit;
1487                 zone->uz_fini = arg->fini;
1488                 zone->uz_lock = &keg->uk_lock;
1489                 zone->uz_flags |= UMA_ZONE_SECONDARY;
1490                 mtx_lock(&uma_mtx);
1491                 ZONE_LOCK(zone);
1492                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1493                         if (LIST_NEXT(z, uz_link) == NULL) {
1494                                 LIST_INSERT_AFTER(z, zone, uz_link);
1495                                 break;
1496                         }
1497                 }
1498                 ZONE_UNLOCK(zone);
1499                 mtx_unlock(&uma_mtx);
1500         } else if (keg == NULL) {
1501                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1502                     arg->align, arg->flags)) == NULL)
1503                         return (ENOMEM);
1504         } else {
1505                 struct uma_kctor_args karg;
1506                 int error;
1507
1508                 /* We should only be here from uma_startup() */
1509                 karg.size = arg->size;
1510                 karg.uminit = arg->uminit;
1511                 karg.fini = arg->fini;
1512                 karg.align = arg->align;
1513                 karg.flags = arg->flags;
1514                 karg.zone = zone;
1515                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1516                     flags);
1517                 if (error)
1518                         return (error);
1519         }
1520         /*
1521          * Link in the first keg.
1522          */
1523         zone->uz_klink.kl_keg = keg;
1524         LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1525         zone->uz_lock = &keg->uk_lock;
1526         zone->uz_size = keg->uk_size;
1527         zone->uz_flags |= (keg->uk_flags &
1528             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1529
1530         /*
1531          * Some internal zones don't have room allocated for the per cpu
1532          * caches.  If we're internal, bail out here.
1533          */
1534         if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1535                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1536                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1537                 return (0);
1538         }
1539
1540         if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1541                 zone->uz_count = BUCKET_MAX;
1542         else if (keg->uk_ipers <= BUCKET_MAX)
1543                 zone->uz_count = keg->uk_ipers;
1544         else
1545                 zone->uz_count = BUCKET_MAX;
1546         return (0);
1547 }
1548
1549 /*
1550  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1551  * table and removes the keg from the global list.
1552  *
1553  * Arguments/Returns follow uma_dtor specifications
1554  *      udata  unused
1555  */
1556 static void
1557 keg_dtor(void *arg, int size, void *udata)
1558 {
1559         uma_keg_t keg;
1560
1561         keg = (uma_keg_t)arg;
1562         KEG_LOCK(keg);
1563         if (keg->uk_free != 0) {
1564                 printf("Freed UMA keg was not empty (%d items). "
1565                     " Lost %d pages of memory.\n",
1566                     keg->uk_free, keg->uk_pages);
1567         }
1568         KEG_UNLOCK(keg);
1569
1570         hash_free(&keg->uk_hash);
1571
1572         KEG_LOCK_FINI(keg);
1573 }
1574
1575 /*
1576  * Zone header dtor.
1577  *
1578  * Arguments/Returns follow uma_dtor specifications
1579  *      udata  unused
1580  */
1581 static void
1582 zone_dtor(void *arg, int size, void *udata)
1583 {
1584         uma_klink_t klink;
1585         uma_zone_t zone;
1586         uma_keg_t keg;
1587
1588         zone = (uma_zone_t)arg;
1589         keg = zone_first_keg(zone);
1590
1591         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1592                 cache_drain(zone);
1593
1594         mtx_lock(&uma_mtx);
1595         LIST_REMOVE(zone, uz_link);
1596         mtx_unlock(&uma_mtx);
1597         /*
1598          * XXX there are some races here where
1599          * the zone can be drained but zone lock
1600          * released and then refilled before we
1601          * remove it... we dont care for now
1602          */
1603         zone_drain_wait(zone, M_WAITOK);
1604         /*
1605          * Unlink all of our kegs.
1606          */
1607         while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1608                 klink->kl_keg = NULL;
1609                 LIST_REMOVE(klink, kl_link);
1610                 if (klink == &zone->uz_klink)
1611                         continue;
1612                 free(klink, M_TEMP);
1613         }
1614         /*
1615          * We only destroy kegs from non secondary zones.
1616          */
1617         if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1618                 mtx_lock(&uma_mtx);
1619                 LIST_REMOVE(keg, uk_link);
1620                 mtx_unlock(&uma_mtx);
1621                 zone_free_item(kegs, keg, NULL, SKIP_NONE,
1622                     ZFREE_STATFREE);
1623         }
1624 }
1625
1626 /*
1627  * Traverses every zone in the system and calls a callback
1628  *
1629  * Arguments:
1630  *      zfunc  A pointer to a function which accepts a zone
1631  *              as an argument.
1632  *
1633  * Returns:
1634  *      Nothing
1635  */
1636 static void
1637 zone_foreach(void (*zfunc)(uma_zone_t))
1638 {
1639         uma_keg_t keg;
1640         uma_zone_t zone;
1641
1642         mtx_lock(&uma_mtx);
1643         LIST_FOREACH(keg, &uma_kegs, uk_link) {
1644                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1645                         zfunc(zone);
1646         }
1647         mtx_unlock(&uma_mtx);
1648 }
1649
1650 /* Public functions */
1651 /* See uma.h */
1652 void
1653 uma_startup(void *bootmem, int boot_pages)
1654 {
1655         struct uma_zctor_args args;
1656         uma_slab_t slab;
1657         u_int slabsize;
1658         u_int objsize, totsize, wsize;
1659         int i;
1660
1661 #ifdef UMA_DEBUG
1662         printf("Creating uma keg headers zone and keg.\n");
1663 #endif
1664         mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1665
1666         /*
1667          * Figure out the maximum number of items-per-slab we'll have if
1668          * we're using the OFFPAGE slab header to track free items, given
1669          * all possible object sizes and the maximum desired wastage
1670          * (UMA_MAX_WASTE).
1671          *
1672          * We iterate until we find an object size for
1673          * which the calculated wastage in keg_small_init() will be
1674          * enough to warrant OFFPAGE.  Since wastedspace versus objsize
1675          * is an overall increasing see-saw function, we find the smallest
1676          * objsize such that the wastage is always acceptable for objects
1677          * with that objsize or smaller.  Since a smaller objsize always
1678          * generates a larger possible uma_max_ipers, we use this computed
1679          * objsize to calculate the largest ipers possible.  Since the
1680          * ipers calculated for OFFPAGE slab headers is always larger than
1681          * the ipers initially calculated in keg_small_init(), we use
1682          * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
1683          * obtain the maximum ipers possible for offpage slab headers.
1684          *
1685          * It should be noted that ipers versus objsize is an inversely
1686          * proportional function which drops off rather quickly so as
1687          * long as our UMA_MAX_WASTE is such that the objsize we calculate
1688          * falls into the portion of the inverse relation AFTER the steep
1689          * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
1690          *
1691          * Note that we have 8-bits (1 byte) to use as a freelist index
1692          * inside the actual slab header itself and this is enough to
1693          * accommodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
1694          * object with offpage slab header would have ipers =
1695          * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
1696          * 1 greater than what our byte-integer freelist index can
1697          * accommodate, but we know that this situation never occurs as
1698          * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
1699          * that we need to go to offpage slab headers.  Or, if we do,
1700          * then we trap that condition below and panic in the INVARIANTS case.
1701          */
1702         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) -
1703             (UMA_SLAB_SIZE / UMA_MAX_WASTE);
1704         totsize = wsize;
1705         objsize = UMA_SMALLEST_UNIT;
1706         while (totsize >= wsize) {
1707                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
1708                     (objsize + UMA_FRITM_SZ);
1709                 totsize *= (UMA_FRITM_SZ + objsize);
1710                 objsize++;
1711         }
1712         if (objsize > UMA_SMALLEST_UNIT)
1713                 objsize--;
1714         uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
1715
1716         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
1717             (UMA_SLAB_SIZE / UMA_MAX_WASTE);
1718         totsize = wsize;
1719         objsize = UMA_SMALLEST_UNIT;
1720         while (totsize >= wsize) {
1721                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
1722                     (objsize + UMA_FRITMREF_SZ);
1723                 totsize *= (UMA_FRITMREF_SZ + objsize);
1724                 objsize++;
1725         }
1726         if (objsize > UMA_SMALLEST_UNIT)
1727                 objsize--;
1728         uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
1729
1730         KASSERT((uma_max_ipers_ref <= 256) && (uma_max_ipers <= 256),
1731             ("uma_startup: calculated uma_max_ipers values too large!"));
1732
1733 #ifdef UMA_DEBUG
1734         printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
1735         printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
1736             uma_max_ipers_ref);
1737 #endif
1738
1739         /* "manually" create the initial zone */
1740         args.name = "UMA Kegs";
1741         args.size = sizeof(struct uma_keg);
1742         args.ctor = keg_ctor;
1743         args.dtor = keg_dtor;
1744         args.uminit = zero_init;
1745         args.fini = NULL;
1746         args.keg = &masterkeg;
1747         args.align = 32 - 1;
1748         args.flags = UMA_ZFLAG_INTERNAL;
1749         /* The initial zone has no Per cpu queues so it's smaller */
1750         zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1751
1752 #ifdef UMA_DEBUG
1753         printf("Filling boot free list.\n");
1754 #endif
1755         for (i = 0; i < boot_pages; i++) {
1756                 slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
1757                 slab->us_data = (uint8_t *)slab;
1758                 slab->us_flags = UMA_SLAB_BOOT;
1759                 LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1760         }
1761         mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1762
1763 #ifdef UMA_DEBUG
1764         printf("Creating uma zone headers zone and keg.\n");
1765 #endif
1766         args.name = "UMA Zones";
1767         args.size = sizeof(struct uma_zone) +
1768             (sizeof(struct uma_cache) * (mp_maxid + 1));
1769         args.ctor = zone_ctor;
1770         args.dtor = zone_dtor;
1771         args.uminit = zero_init;
1772         args.fini = NULL;
1773         args.keg = NULL;
1774         args.align = 32 - 1;
1775         args.flags = UMA_ZFLAG_INTERNAL;
1776         /* The initial zone has no Per cpu queues so it's smaller */
1777         zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1778
1779 #ifdef UMA_DEBUG
1780         printf("Initializing pcpu cache locks.\n");
1781 #endif
1782 #ifdef UMA_DEBUG
1783         printf("Creating slab and hash zones.\n");
1784 #endif
1785
1786         /*
1787          * This is the max number of free list items we'll have with
1788          * offpage slabs.
1789          */
1790         slabsize = uma_max_ipers * UMA_FRITM_SZ;
1791         slabsize += sizeof(struct uma_slab);
1792
1793         /* Now make a zone for slab headers */
1794         slabzone = uma_zcreate("UMA Slabs",
1795                                 slabsize,
1796                                 NULL, NULL, NULL, NULL,
1797                                 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1798
        /*
         * We also create a zone for the bigger slabs with reference
         * counts in them, to accommodate UMA_ZONE_REFCNT zones.
         */
1803         slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
1804         slabsize += sizeof(struct uma_slab_refcnt);
1805         slabrefzone = uma_zcreate("UMA RCntSlabs",
1806                                   slabsize,
1807                                   NULL, NULL, NULL, NULL,
1808                                   UMA_ALIGN_PTR,
1809                                   UMA_ZFLAG_INTERNAL);
1810
1811         hashzone = uma_zcreate("UMA Hash",
1812             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1813             NULL, NULL, NULL, NULL,
1814             UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1815
1816         bucket_init();
1817
1818         booted = UMA_STARTUP;
1819
1820 #ifdef UMA_DEBUG
1821         printf("UMA startup complete.\n");
1822 #endif
1823 }
1824
/*
 * See uma.h.
 *
 * Second startup stage: moves the global "booted" state to UMA_STARTUP2
 * and then calls bucket_enable().
 */
void
uma_startup2(void)
{
        /* Record the new boot stage before bucket_enable() runs. */
        booted = UMA_STARTUP2;
        bucket_enable();
#ifdef UMA_DEBUG
        printf("UMA startup2 complete.\n");
#endif
}
1835
/*
 * Initialize our callout handle.
 *
 * Sets up uma_callout with CALLOUT_MPSAFE and arms it so that
 * uma_timeout() is invoked with a NULL argument after a delay of
 * UMA_TIMEOUT * hz.
 */

static void
uma_startup3(void)
{
#ifdef UMA_DEBUG
        printf("Starting callout.\n");
#endif
        /* The callout must be initialized before it can be (re)armed. */
        callout_init(&uma_callout, CALLOUT_MPSAFE);
        callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
#ifdef UMA_DEBUG
        printf("UMA startup3 complete.\n");
#endif
}
1853
1854 static uma_keg_t
1855 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1856                 int align, uint32_t flags)
1857 {
1858         struct uma_kctor_args args;
1859
1860         args.size = size;
1861         args.uminit = uminit;
1862         args.fini = fini;
1863         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1864         args.flags = flags;
1865         args.zone = zone;
1866         return (zone_alloc_item(kegs, &args, M_WAITOK));
1867 }
1868
1869 /* See uma.h */
1870 void
1871 uma_set_align(int align)
1872 {
1873
1874         if (align != UMA_ALIGN_CACHE)
1875                 uma_align_cache = align;
1876 }
1877
1878 /* See uma.h */
1879 uma_zone_t
1880 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1881                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
1882
1883 {
1884         struct uma_zctor_args args;
1885
1886         /* This stuff is essential for the zone ctor */
1887         args.name = name;
1888         args.size = size;
1889         args.ctor = ctor;
1890         args.dtor = dtor;
1891         args.uminit = uminit;
1892         args.fini = fini;
1893         args.align = align;
1894         args.flags = flags;
1895         args.keg = NULL;
1896
1897         return (zone_alloc_item(zones, &args, M_WAITOK));
1898 }
1899
1900 /* See uma.h */
1901 uma_zone_t
1902 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1903                     uma_init zinit, uma_fini zfini, uma_zone_t master)
1904 {
1905         struct uma_zctor_args args;
1906         uma_keg_t keg;
1907
1908         keg = zone_first_keg(master);
1909         args.name = name;
1910         args.size = keg->uk_size;
1911         args.ctor = ctor;
1912         args.dtor = dtor;
1913         args.uminit = zinit;
1914         args.fini = zfini;
1915         args.align = keg->uk_align;
1916         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1917         args.keg = keg;
1918
1919         /* XXX Attaches only one keg of potentially many. */
1920         return (zone_alloc_item(zones, &args, M_WAITOK));
1921 }
1922
1923 static void
1924 zone_lock_pair(uma_zone_t a, uma_zone_t b)
1925 {
1926         if (a < b) {
1927                 ZONE_LOCK(a);
1928                 mtx_lock_flags(b->uz_lock, MTX_DUPOK);
1929         } else {
1930                 ZONE_LOCK(b);
1931                 mtx_lock_flags(a->uz_lock, MTX_DUPOK);
1932         }
1933 }
1934
/*
 * Unlock both zones of a pair: "a" first, then "b".
 */
static void
zone_unlock_pair(uma_zone_t a, uma_zone_t b)
{

        ZONE_UNLOCK(a);
        ZONE_UNLOCK(b);
}
1942
1943 int
1944 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
1945 {
1946         uma_klink_t klink;
1947         uma_klink_t kl;
1948         int error;
1949
1950         error = 0;
1951         klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
1952
1953         zone_lock_pair(zone, master);
1954         /*
1955          * zone must use vtoslab() to resolve objects and must already be
1956          * a secondary.
1957          */
1958         if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
1959             != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
1960                 error = EINVAL;
1961                 goto out;
1962         }
1963         /*
1964          * The new master must also use vtoslab().
1965          */
1966         if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
1967                 error = EINVAL;
1968                 goto out;
1969         }
1970         /*
1971          * Both must either be refcnt, or not be refcnt.
1972          */
1973         if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
1974             (master->uz_flags & UMA_ZONE_REFCNT)) {
1975                 error = EINVAL;
1976                 goto out;
1977         }
1978         /*
1979          * The underlying object must be the same size.  rsize
1980          * may be different.
1981          */
1982         if (master->uz_size != zone->uz_size) {
1983                 error = E2BIG;
1984                 goto out;
1985         }
1986         /*
1987          * Put it at the end of the list.
1988          */
1989         klink->kl_keg = zone_first_keg(master);
1990         LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
1991                 if (LIST_NEXT(kl, kl_link) == NULL) {
1992                         LIST_INSERT_AFTER(kl, klink, kl_link);
1993                         break;
1994                 }
1995         }
1996         klink = NULL;
1997         zone->uz_flags |= UMA_ZFLAG_MULTI;
1998         zone->uz_slab = zone_fetch_slab_multi;
1999
2000 out:
2001         zone_unlock_pair(zone, master);
2002         if (klink != NULL)
2003                 free(klink, M_TEMP);
2004
2005         return (error);
2006 }
2007
2008
/*
 * See uma.h.
 *
 * Destroy a zone by handing the zone structure itself back to the
 * "zones" zone through zone_free_item(), with no udata, SKIP_NONE and
 * ZFREE_STATFREE.
 */
void
uma_zdestroy(uma_zone_t zone)
{

        zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
}
2016
2017 /* See uma.h */
2018 void *
2019 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2020 {
2021         void *item;
2022         uma_cache_t cache;
2023         uma_bucket_t bucket;
2024         int cpu;
2025
2026         /* This is the fast path allocation */
2027 #ifdef UMA_DEBUG_ALLOC_1
2028         printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2029 #endif
2030         CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2031             zone->uz_name, flags);
2032
2033         if (flags & M_WAITOK) {
2034                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2035                     "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2036         }
2037 #ifdef DEBUG_MEMGUARD
2038         if (memguard_cmp_zone(zone)) {
2039                 item = memguard_alloc(zone->uz_size, flags);
2040                 if (item != NULL) {
2041                         /*
2042                          * Avoid conflict with the use-after-free
2043                          * protecting infrastructure from INVARIANTS.
2044                          */
2045                         if (zone->uz_init != NULL &&
2046                             zone->uz_init != mtrash_init &&
2047                             zone->uz_init(item, zone->uz_size, flags) != 0)
2048                                 return (NULL);
2049                         if (zone->uz_ctor != NULL &&
2050                             zone->uz_ctor != mtrash_ctor &&
2051                             zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2052                                 zone->uz_fini(item, zone->uz_size);
2053                                 return (NULL);
2054                         }
2055                         return (item);
2056                 }
2057                 /* This is unfortunate but should not be fatal. */
2058         }
2059 #endif
2060         /*
2061          * If possible, allocate from the per-CPU cache.  There are two
2062          * requirements for safe access to the per-CPU cache: (1) the thread
2063          * accessing the cache must not be preempted or yield during access,
2064          * and (2) the thread must not migrate CPUs without switching which
2065          * cache it accesses.  We rely on a critical section to prevent
2066          * preemption and migration.  We release the critical section in
2067          * order to acquire the zone mutex if we are unable to allocate from
2068          * the current cache; when we re-acquire the critical section, we
2069          * must detect and handle migration if it has occurred.
2070          */
2071 zalloc_restart:
2072         critical_enter();
2073         cpu = curcpu;
2074         cache = &zone->uz_cpu[cpu];
2075
2076 zalloc_start:
2077         bucket = cache->uc_allocbucket;
2078
2079         if (bucket) {
2080                 if (bucket->ub_cnt > 0) {
2081                         bucket->ub_cnt--;
2082                         item = bucket->ub_bucket[bucket->ub_cnt];
2083 #ifdef INVARIANTS
2084                         bucket->ub_bucket[bucket->ub_cnt] = NULL;
2085 #endif
2086                         KASSERT(item != NULL,
2087                             ("uma_zalloc: Bucket pointer mangled."));
2088                         cache->uc_allocs++;
2089                         critical_exit();
2090 #ifdef INVARIANTS
2091                         ZONE_LOCK(zone);
2092                         uma_dbg_alloc(zone, NULL, item);
2093                         ZONE_UNLOCK(zone);
2094 #endif
2095                         if (zone->uz_ctor != NULL) {
2096                                 if (zone->uz_ctor(item, zone->uz_size,
2097                                     udata, flags) != 0) {
2098                                         zone_free_item(zone, item, udata,
2099                                             SKIP_DTOR, ZFREE_STATFAIL |
2100                                             ZFREE_STATFREE);
2101                                         return (NULL);
2102                                 }
2103                         }
2104                         if (flags & M_ZERO)
2105                                 bzero(item, zone->uz_size);
2106                         return (item);
2107                 } else if (cache->uc_freebucket) {
2108                         /*
2109                          * We have run out of items in our allocbucket.
2110                          * See if we can switch with our free bucket.
2111                          */
2112                         if (cache->uc_freebucket->ub_cnt > 0) {
2113 #ifdef UMA_DEBUG_ALLOC
2114                                 printf("uma_zalloc: Swapping empty with"
2115                                     " alloc.\n");
2116 #endif
2117                                 bucket = cache->uc_freebucket;
2118                                 cache->uc_freebucket = cache->uc_allocbucket;
2119                                 cache->uc_allocbucket = bucket;
2120
2121                                 goto zalloc_start;
2122                         }
2123                 }
2124         }
2125         /*
2126          * Attempt to retrieve the item from the per-CPU cache has failed, so
2127          * we must go back to the zone.  This requires the zone lock, so we
2128          * must drop the critical section, then re-acquire it when we go back
2129          * to the cache.  Since the critical section is released, we may be
2130          * preempted or migrate.  As such, make sure not to maintain any
2131          * thread-local state specific to the cache from prior to releasing
2132          * the critical section.
2133          */
2134         critical_exit();
2135         ZONE_LOCK(zone);
2136         critical_enter();
2137         cpu = curcpu;
2138         cache = &zone->uz_cpu[cpu];
2139         bucket = cache->uc_allocbucket;
2140         if (bucket != NULL) {
2141                 if (bucket->ub_cnt > 0) {
2142                         ZONE_UNLOCK(zone);
2143                         goto zalloc_start;
2144                 }
2145                 bucket = cache->uc_freebucket;
2146                 if (bucket != NULL && bucket->ub_cnt > 0) {
2147                         ZONE_UNLOCK(zone);
2148                         goto zalloc_start;
2149                 }
2150         }
2151
2152         /* Since we have locked the zone we may as well send back our stats */
2153         zone->uz_allocs += cache->uc_allocs;
2154         cache->uc_allocs = 0;
2155         zone->uz_frees += cache->uc_frees;
2156         cache->uc_frees = 0;
2157
2158         /* Our old one is now a free bucket */
2159         if (cache->uc_allocbucket) {
2160                 KASSERT(cache->uc_allocbucket->ub_cnt == 0,
2161                     ("uma_zalloc_arg: Freeing a non free bucket."));
2162                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
2163                     cache->uc_allocbucket, ub_link);
2164                 cache->uc_allocbucket = NULL;
2165         }
2166
2167         /* Check the free list for a new alloc bucket */
2168         if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
2169                 KASSERT(bucket->ub_cnt != 0,
2170                     ("uma_zalloc_arg: Returning an empty bucket."));
2171
2172                 LIST_REMOVE(bucket, ub_link);
2173                 cache->uc_allocbucket = bucket;
2174                 ZONE_UNLOCK(zone);
2175                 goto zalloc_start;
2176         }
2177         /* We are no longer associated with this CPU. */
2178         critical_exit();
2179
2180         /* Bump up our uz_count so we get here less */
2181         if (zone->uz_count < BUCKET_MAX)
2182                 zone->uz_count++;
2183
2184         /*
2185          * Now lets just fill a bucket and put it on the free list.  If that
2186          * works we'll restart the allocation from the begining.
2187          */
2188         if (zone_alloc_bucket(zone, flags)) {
2189                 ZONE_UNLOCK(zone);
2190                 goto zalloc_restart;
2191         }
2192         ZONE_UNLOCK(zone);
2193         /*
2194          * We may not be able to get a bucket so return an actual item.
2195          */
2196 #ifdef UMA_DEBUG
2197         printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2198 #endif
2199
2200         item = zone_alloc_item(zone, udata, flags);
2201         return (item);
2202 }
2203
/*
 * Find or allocate a slab in "keg" that has at least one free item.
 *
 * The keg lock must be held on entry (asserted below).  It is passed to
 * msleep() when the keg has reached its page limit and the caller did
 * not ask for M_NOWAIT.
 *
 * Returns a slab that is on the keg's partial list, or NULL when no slab
 * is available: M_NOVM was given and nothing is free, the page limit is
 * reached with M_NOWAIT, or keg_alloc_slab() failed and a re-check found
 * no free items.
 */
static uma_slab_t
keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
{
        uma_slab_t slab;

        mtx_assert(&keg->uk_lock, MA_OWNED);
        slab = NULL;

        for (;;) {
                /*
                 * Find a slab with some space.  Prefer slabs that are partially
                 * used over those that are totally full.  This helps to reduce
                 * fragmentation.
                 */
                if (keg->uk_free != 0) {
                        if (!LIST_EMPTY(&keg->uk_part_slab)) {
                                slab = LIST_FIRST(&keg->uk_part_slab);
                        } else {
                                /* Promote a completely free slab to partial. */
                                slab = LIST_FIRST(&keg->uk_free_slab);
                                LIST_REMOVE(slab, us_link);
                                LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
                                    us_link);
                        }
                        MPASS(slab->us_keg == keg);
                        return (slab);
                }

                /*
                 * M_NOVM means don't ask at all!
                 */
                if (flags & M_NOVM)
                        break;

                if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
                        keg->uk_flags |= UMA_ZFLAG_FULL;
                        /*
                         * If this is not a multi-zone, set the FULL bit.
                         * Otherwise slab_multi() takes care of it.
                         */
                        if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
                                zone->uz_flags |= UMA_ZFLAG_FULL;
                                zone_log_warning(zone);
                        }
                        if (flags & M_NOWAIT)
                                break;
                        /* Sleep on the keg, then re-evaluate from the top. */
                        zone->uz_sleeps++;
                        msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
                        continue;
                }
                /* uk_recurse is raised for the duration of the slab allocation. */
                keg->uk_recurse++;
                slab = keg_alloc_slab(keg, zone, flags);
                keg->uk_recurse--;
                /*
                 * If we got a slab here it's safe to mark it partially used
                 * and return.  We assume that the caller is going to remove
                 * at least one item.
                 */
                if (slab) {
                        MPASS(slab->us_keg == keg);
                        LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
                        return (slab);
                }
                /*
                 * We might not have been able to get a slab but another cpu
                 * could have while we were unlocked.  Check again before we
                 * fail.
                 */
                flags |= M_NOVM;
        }
        return (slab);
}
2275
2276 static inline void
2277 zone_relock(uma_zone_t zone, uma_keg_t keg)
2278 {
2279         if (zone->uz_lock != &keg->uk_lock) {
2280                 KEG_UNLOCK(keg);
2281                 ZONE_LOCK(zone);
2282         }
2283 }
2284
2285 static inline void
2286 keg_relock(uma_keg_t keg, uma_zone_t zone)
2287 {
2288         if (zone->uz_lock != &keg->uk_lock) {
2289                 ZONE_UNLOCK(zone);
2290                 KEG_LOCK(keg);
2291         }
2292 }
2293
2294 static uma_slab_t
2295 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2296 {
2297         uma_slab_t slab;
2298
2299         if (keg == NULL)
2300                 keg = zone_first_keg(zone);
2301         /*
2302          * This is to prevent us from recursively trying to allocate
2303          * buckets.  The problem is that if an allocation forces us to
2304          * grab a new bucket we will call page_alloc, which will go off
2305          * and cause the vm to allocate vm_map_entries.  If we need new
2306          * buckets there too we will recurse in kmem_alloc and bad
2307          * things happen.  So instead we return a NULL bucket, and make
2308          * the code that allocates buckets smart enough to deal with it
2309          */
2310         if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
2311                 return (NULL);
2312
2313         for (;;) {
2314                 slab = keg_fetch_slab(keg, zone, flags);
2315                 if (slab)
2316                         return (slab);
2317                 if (flags & (M_NOWAIT | M_NOVM))
2318                         break;
2319         }
2320         return (NULL);
2321 }
2322
/*
 * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
 * with the keg locked.  Caller must call zone_relock() afterwards if the
 * zone lock is required.  On NULL the zone lock is held.
 *
 * The last pointer is used to seed the search.  It is not required.
 *
 * rflags are the caller's allocation flags.  The first pass over the kegs
 * is always made with M_WAITOK cleared and M_NOWAIT set; later passes use
 * rflags unchanged.
 */
static uma_slab_t
zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
{
        uma_klink_t klink;
        uma_slab_t slab;
        uma_keg_t keg;
        int flags;
        int empty;
        int full;

        /*
         * Don't wait on the first pass.  This will skip limit tests
         * as well.  We don't want to block if we can find a provider
         * without blocking.
         */
        flags = (rflags & ~M_WAITOK) | M_NOWAIT;
        /*
         * Use the last slab allocated as a hint for where to start
         * the search.
         */
        if (last) {
                slab = keg_fetch_slab(last, zone, flags);
                if (slab)
                        return (slab);
                /* The hint failed; go back to the zone lock for the scan. */
                zone_relock(zone, last);
                last = NULL;
        }
        /*
         * Loop until we have a slab in case of transient failures
         * while M_WAITOK is specified.  I'm not sure this is 100%
         * required but we've done it for so long now.
         */
        for (;;) {
                /* Per-pass counts of kegs that are / are not marked FULL. */
                empty = 0;
                full = 0;
                /*
                 * Search the available kegs for slabs.  Be careful to hold the
                 * correct lock while calling into the keg layer.
                 */
                LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
                        keg = klink->kl_keg;
                        keg_relock(keg, zone);
                        if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
                                slab = keg_fetch_slab(keg, zone, flags);
                                if (slab)
                                        return (slab);
                        }
                        /*
                         * Re-test FULL: keg_fetch_slab() may have set it
                         * during the call above.
                         */
                        if (keg->uk_flags & UMA_ZFLAG_FULL)
                                full++;
                        else
                                empty++;
                        zone_relock(zone, keg);
                }
                /* Non-blocking callers get exactly one pass. */
                if (rflags & (M_NOWAIT | M_NOVM))
                        break;
                flags = rflags;
                /*
                 * All kegs are full.  XXX We can't atomically check all kegs
                 * and sleep so just sleep for a short period and retry.
                 */
                if (full && !empty) {
                        zone->uz_flags |= UMA_ZFLAG_FULL;
                        zone->uz_sleeps++;
                        zone_log_warning(zone);
                        msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
                        zone->uz_flags &= ~UMA_ZFLAG_FULL;
                        continue;
                }
        }
        return (NULL);
}
2401
2402 static void *
2403 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
2404 {
2405         uma_keg_t keg;
2406         uma_slabrefcnt_t slabref;
2407         void *item;
2408         uint8_t freei;
2409
2410         keg = slab->us_keg;
2411         mtx_assert(&keg->uk_lock, MA_OWNED);
2412
2413         freei = slab->us_firstfree;
2414         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2415                 slabref = (uma_slabrefcnt_t)slab;
2416                 slab->us_firstfree = slabref->us_freelist[freei].us_item;
2417         } else {
2418                 slab->us_firstfree = slab->us_freelist[freei].us_item;
2419         }
2420         item = slab->us_data + (keg->uk_rsize * freei);
2421
2422         slab->us_freecount--;
2423         keg->uk_free--;
2424 #ifdef INVARIANTS
2425         uma_dbg_alloc(zone, slab, item);
2426 #endif
2427         /* Move this slab to the full list */
2428         if (slab->us_freecount == 0) {
2429                 LIST_REMOVE(slab, us_link);
2430                 LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2431         }
2432
2433         return (item);
2434 }
2435
/*
 * Fill a bucket with items from the zone's keg(s) and place it on the
 * zone's full-bucket list.
 *
 * The zone lock is dropped and re-taken inside this function (around
 * bucket_alloc() and around the zone init callbacks), so zone state may
 * change across the call.
 * NOTE(review): the caller is presumably expected to hold the zone lock
 * on entry and gets it back on return -- confirm against callers.
 *
 * Returns 1 if a non-empty bucket was queued on uz_full_bucket, 0 if no
 * bucket could be obtained, the fill limit was hit, or no item could be
 * placed in the bucket.
 */
static int
zone_alloc_bucket(uma_zone_t zone, int flags)
{
        uma_bucket_t bucket;
        uma_slab_t slab;
        uma_keg_t keg;
        int16_t saved;
        int max, origflags = flags;

        /*
         * Try this zone's free list first so we don't allocate extra buckets.
         */
        if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
                KASSERT(bucket->ub_cnt == 0,
                    ("zone_alloc_bucket: Bucket on free list is not empty."));
                LIST_REMOVE(bucket, ub_link);
        } else {
                int bflags;

                /* The bucket itself is never zeroed on the caller's behalf. */
                bflags = (flags & ~M_ZERO);
                if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
                        bflags |= M_NOVM;

                /* bucket_alloc() is called without the zone lock held. */
                ZONE_UNLOCK(zone);
                bucket = bucket_alloc(zone->uz_count, bflags);
                ZONE_LOCK(zone);
        }

        if (bucket == NULL) {
                return (0);
        }

#ifdef SMP
        /*
         * This code is here to limit the number of simultaneous bucket fills
         * for any given zone to the number of per cpu caches in this zone. This
         * is done so that we don't allocate more memory than we really need.
         */
        if (zone->uz_fills >= mp_ncpus)
                goto done;

#endif
        zone->uz_fills++;

        max = MIN(bucket->ub_entries, zone->uz_count);
        /* Try to keep the buckets totally full */
        saved = bucket->ub_cnt;
        slab = NULL;
        keg = NULL;
        /* keg carries the previous slab's keg into the next uz_slab() call. */
        while (bucket->ub_cnt < max &&
            (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
                keg = slab->us_keg;
                while (slab->us_freecount && bucket->ub_cnt < max) {
                        bucket->ub_bucket[bucket->ub_cnt++] =
                            slab_alloc_item(zone, slab);
                }

                /* Don't block on the next fill */
                flags |= M_NOWAIT;
        }
        /* A non-NULL slab means the last fetch succeeded; relock the zone. */
        if (slab)
                zone_relock(zone, keg);

        /*
         * We unlock here because we need to call the zone's init.
         * It should be safe to unlock because the slab dealt with
         * above is already on the appropriate list within the keg
         * and the bucket we filled is not yet on any list, so we
         * own it.
         */
        if (zone->uz_init != NULL) {
                int i;

                ZONE_UNLOCK(zone);
                /* Only the items added by this call (from "saved" on) are init'd. */
                for (i = saved; i < bucket->ub_cnt; i++)
                        if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
                            origflags) != 0)
                                break;
                /*
                 * If we couldn't initialize the whole bucket, put the
                 * rest back onto the freelist.
                 */
                if (i != bucket->ub_cnt) {
                        int j;

                        for (j = i; j < bucket->ub_cnt; j++) {
                                zone_free_item(zone, bucket->ub_bucket[j],
                                    NULL, SKIP_FINI, 0);
#ifdef INVARIANTS
                                bucket->ub_bucket[j] = NULL;
#endif
                        }
                        bucket->ub_cnt = i;
                }
                ZONE_LOCK(zone);
        }

        zone->uz_fills--;
        if (bucket->ub_cnt != 0) {
                LIST_INSERT_HEAD(&zone->uz_full_bucket,
                    bucket, ub_link);
                return (1);
        }
#ifdef SMP
done:
#endif
        /* Nothing usable ended up in the bucket; release it. */
        bucket_free(bucket);

        return (0);
}
2546 /*
2547  * Allocates an item for an internal zone
2548  *
2549  * Arguments
2550  *      zone   The zone to alloc for.
2551  *      udata  The data to be passed to the constructor.
2552  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
2553  *
2554  * Returns
2555  *      NULL if there is no memory and M_NOWAIT is set
2556  *      An item if successful
2557  */
2558
2559 static void *
2560 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2561 {
2562         uma_slab_t slab;
2563         void *item;
2564
2565         item = NULL;
2566
2567 #ifdef UMA_DEBUG_ALLOC
2568         printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2569 #endif
2570         ZONE_LOCK(zone);
2571
2572         slab = zone->uz_slab(zone, NULL, flags);
2573         if (slab == NULL) {
2574                 zone->uz_fails++;
2575                 ZONE_UNLOCK(zone);
2576                 return (NULL);
2577         }
2578
2579         item = slab_alloc_item(zone, slab);
2580
2581         zone_relock(zone, slab->us_keg);
2582         zone->uz_allocs++;
2583         ZONE_UNLOCK(zone);
2584
2585         /*
2586          * We have to call both the zone's init (not the keg's init)
2587          * and the zone's ctor.  This is because the item is going from
2588          * a keg slab directly to the user, and the user is expecting it
2589          * to be both zone-init'd as well as zone-ctor'd.
2590          */
2591         if (zone->uz_init != NULL) {
2592                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2593                         zone_free_item(zone, item, udata, SKIP_FINI,
2594                             ZFREE_STATFAIL | ZFREE_STATFREE);
2595                         return (NULL);
2596                 }
2597         }
2598         if (zone->uz_ctor != NULL) {
2599                 if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2600                         zone_free_item(zone, item, udata, SKIP_DTOR,
2601                             ZFREE_STATFAIL | ZFREE_STATFREE);
2602                         return (NULL);
2603                 }
2604         }
2605         if (flags & M_ZERO)
2606                 bzero(item, zone->uz_size);
2607
2608         return (item);
2609 }
2610
2611 /* See uma.h */
2612 void
2613 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2614 {
2615         uma_cache_t cache;
2616         uma_bucket_t bucket;
2617         int bflags;
2618         int cpu;
2619
2620 #ifdef UMA_DEBUG_ALLOC_1
2621         printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2622 #endif
2623         CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2624             zone->uz_name);
2625
2626         /* uma_zfree(..., NULL) does nothing, to match free(9). */
2627         if (item == NULL)
2628                 return;
2629 #ifdef DEBUG_MEMGUARD
2630         if (is_memguard_addr(item)) {
2631                 if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
2632                         zone->uz_dtor(item, zone->uz_size, udata);
2633                 if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
2634                         zone->uz_fini(item, zone->uz_size);
2635                 memguard_free(item);
2636                 return;
2637         }
2638 #endif
2639         if (zone->uz_dtor)
2640                 zone->uz_dtor(item, zone->uz_size, udata);
2641
2642 #ifdef INVARIANTS
2643         ZONE_LOCK(zone);
2644         if (zone->uz_flags & UMA_ZONE_MALLOC)
2645                 uma_dbg_free(zone, udata, item);
2646         else
2647                 uma_dbg_free(zone, NULL, item);
2648         ZONE_UNLOCK(zone);
2649 #endif
2650         /*
2651          * The race here is acceptable.  If we miss it we'll just have to wait
2652          * a little longer for the limits to be reset.
2653          */
2654         if (zone->uz_flags & UMA_ZFLAG_FULL)
2655                 goto zfree_internal;
2656
2657         /*
2658          * If possible, free to the per-CPU cache.  There are two
2659          * requirements for safe access to the per-CPU cache: (1) the thread
2660          * accessing the cache must not be preempted or yield during access,
2661          * and (2) the thread must not migrate CPUs without switching which
2662          * cache it accesses.  We rely on a critical section to prevent
2663          * preemption and migration.  We release the critical section in
2664          * order to acquire the zone mutex if we are unable to free to the
2665          * current cache; when we re-acquire the critical section, we must
2666          * detect and handle migration if it has occurred.
2667          */
2668 zfree_restart:
2669         critical_enter();
2670         cpu = curcpu;
2671         cache = &zone->uz_cpu[cpu];
2672
2673 zfree_start:
2674         bucket = cache->uc_freebucket;
2675
2676         if (bucket) {
2677                 /*
2678                  * Do we have room in our bucket? It is OK for this uz count
2679                  * check to be slightly out of sync.
2680                  */
2681
2682                 if (bucket->ub_cnt < bucket->ub_entries) {
2683                         KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2684                             ("uma_zfree: Freeing to non free bucket index."));
2685                         bucket->ub_bucket[bucket->ub_cnt] = item;
2686                         bucket->ub_cnt++;
2687                         cache->uc_frees++;
2688                         critical_exit();
2689                         return;
2690                 } else if (cache->uc_allocbucket) {
2691 #ifdef UMA_DEBUG_ALLOC
2692                         printf("uma_zfree: Swapping buckets.\n");
2693 #endif
2694                         /*
2695                          * We have run out of space in our freebucket.
2696                          * See if we can switch with our alloc bucket.
2697                          */
2698                         if (cache->uc_allocbucket->ub_cnt <
2699                             cache->uc_freebucket->ub_cnt) {
2700                                 bucket = cache->uc_freebucket;
2701                                 cache->uc_freebucket = cache->uc_allocbucket;
2702                                 cache->uc_allocbucket = bucket;
2703                                 goto zfree_start;
2704                         }
2705                 }
2706         }
2707         /*
2708          * We can get here for two reasons:
2709          *
2710          * 1) The buckets are NULL
2711          * 2) The alloc and free buckets are both somewhat full.
2712          *
2713          * We must go back the zone, which requires acquiring the zone lock,
2714          * which in turn means we must release and re-acquire the critical
2715          * section.  Since the critical section is released, we may be
2716          * preempted or migrate.  As such, make sure not to maintain any
2717          * thread-local state specific to the cache from prior to releasing
2718          * the critical section.
2719          */
2720         critical_exit();
2721         ZONE_LOCK(zone);
2722         critical_enter();
2723         cpu = curcpu;
2724         cache = &zone->uz_cpu[cpu];
2725         if (cache->uc_freebucket != NULL) {
2726                 if (cache->uc_freebucket->ub_cnt <
2727                     cache->uc_freebucket->ub_entries) {
2728                         ZONE_UNLOCK(zone);
2729                         goto zfree_start;
2730                 }
2731                 if (cache->uc_allocbucket != NULL &&
2732                     (cache->uc_allocbucket->ub_cnt <
2733                     cache->uc_freebucket->ub_cnt)) {
2734                         ZONE_UNLOCK(zone);
2735                         goto zfree_start;
2736                 }
2737         }
2738
2739         /* Since we have locked the zone we may as well send back our stats */
2740         zone->uz_allocs += cache->uc_allocs;
2741         cache->uc_allocs = 0;
2742         zone->uz_frees += cache->uc_frees;
2743         cache->uc_frees = 0;
2744
2745         bucket = cache->uc_freebucket;
2746         cache->uc_freebucket = NULL;
2747
2748         /* Can we throw this on the zone full list? */
2749         if (bucket != NULL) {
2750 #ifdef UMA_DEBUG_ALLOC
2751                 printf("uma_zfree: Putting old bucket on the free list.\n");
2752 #endif
2753                 /* ub_cnt is pointing to the last free item */
2754                 KASSERT(bucket->ub_cnt != 0,
2755                     ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2756                 LIST_INSERT_HEAD(&zone->uz_full_bucket,
2757                     bucket, ub_link);
2758         }
2759         if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2760                 LIST_REMOVE(bucket, ub_link);
2761                 ZONE_UNLOCK(zone);
2762                 cache->uc_freebucket = bucket;
2763                 goto zfree_start;
2764         }
2765         /* We are no longer associated with this CPU. */
2766         critical_exit();
2767
2768         /* And the zone.. */
2769         ZONE_UNLOCK(zone);
2770
2771 #ifdef UMA_DEBUG_ALLOC
2772         printf("uma_zfree: Allocating new free bucket.\n");
2773 #endif
2774         bflags = M_NOWAIT;
2775
2776         if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2777                 bflags |= M_NOVM;
2778         bucket = bucket_alloc(zone->uz_count, bflags);
2779         if (bucket) {
2780                 ZONE_LOCK(zone);
2781                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
2782                     bucket, ub_link);
2783                 ZONE_UNLOCK(zone);
2784                 goto zfree_restart;
2785         }
2786
2787         /*
2788          * If nothing else caught this, we'll just do an internal free.
2789          */
2790 zfree_internal:
2791         zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
2792
2793         return;
2794 }
2795
2796 /*
2797  * Frees an item to an INTERNAL zone or allocates a free bucket
2798  *
2799  * Arguments:
2800  *      zone   The zone to free to
2801  *      item   The item we're freeing
2802  *      udata  User supplied data for the dtor
2803  *      skip   Skip dtors and finis
2804  */
2805 static void
2806 zone_free_item(uma_zone_t zone, void *item, void *udata,
2807     enum zfreeskip skip, int flags)
2808 {
2809         uma_slab_t slab;
2810         uma_slabrefcnt_t slabref;
2811         uma_keg_t keg;
2812         uint8_t *mem;
2813         uint8_t freei;
2814         int clearfull;
2815
2816         if (skip < SKIP_DTOR && zone->uz_dtor)
2817                 zone->uz_dtor(item, zone->uz_size, udata);
2818
2819         if (skip < SKIP_FINI && zone->uz_fini)
2820                 zone->uz_fini(item, zone->uz_size);
2821
2822         ZONE_LOCK(zone);
2823
2824         if (flags & ZFREE_STATFAIL)
2825                 zone->uz_fails++;
2826         if (flags & ZFREE_STATFREE)
2827                 zone->uz_frees++;
2828
2829         if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2830                 mem = (uint8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2831                 keg = zone_first_keg(zone); /* Must only be one. */
2832                 if (zone->uz_flags & UMA_ZONE_HASH) {
2833                         slab = hash_sfind(&keg->uk_hash, mem);
2834                 } else {
2835                         mem += keg->uk_pgoff;
2836                         slab = (uma_slab_t)mem;
2837                 }
2838         } else {
2839                 /* This prevents redundant lookups via free(). */
2840                 if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
2841                         slab = (uma_slab_t)udata;
2842                 else
2843                         slab = vtoslab((vm_offset_t)item);
2844                 keg = slab->us_keg;
2845                 keg_relock(keg, zone);
2846         }
2847         MPASS(keg == slab->us_keg);
2848
2849         /* Do we need to remove from any lists? */
2850         if (slab->us_freecount+1 == keg->uk_ipers) {
2851                 LIST_REMOVE(slab, us_link);
2852                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2853         } else if (slab->us_freecount == 0) {
2854                 LIST_REMOVE(slab, us_link);
2855                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2856         }
2857
2858         /* Slab management stuff */
2859         freei = ((unsigned long)item - (unsigned long)slab->us_data)
2860                 / keg->uk_rsize;
2861
2862 #ifdef INVARIANTS
2863         if (!skip)
2864                 uma_dbg_free(zone, slab, item);
2865 #endif
2866
2867         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2868                 slabref = (uma_slabrefcnt_t)slab;
2869                 slabref->us_freelist[freei].us_item = slab->us_firstfree;
2870         } else {
2871                 slab->us_freelist[freei].us_item = slab->us_firstfree;
2872         }
2873         slab->us_firstfree = freei;
2874         slab->us_freecount++;
2875
2876         /* Zone statistics */
2877         keg->uk_free++;
2878
2879         clearfull = 0;
2880         if (keg->uk_flags & UMA_ZFLAG_FULL) {
2881                 if (keg->uk_pages < keg->uk_maxpages) {
2882                         keg->uk_flags &= ~UMA_ZFLAG_FULL;
2883                         clearfull = 1;
2884                 }
2885
2886                 /*
2887                  * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
2888                  * wake up all procs blocked on pages. This should be uncommon, so
2889                  * keeping this simple for now (rather than adding count of blocked
2890                  * threads etc).
2891                  */
2892                 wakeup(keg);
2893         }
2894         if (clearfull) {
2895                 zone_relock(zone, keg);
2896                 zone->uz_flags &= ~UMA_ZFLAG_FULL;
2897                 wakeup(zone);
2898                 ZONE_UNLOCK(zone);
2899         } else
2900                 KEG_UNLOCK(keg);
2901 }
2902
2903 /* See uma.h */
2904 int
2905 uma_zone_set_max(uma_zone_t zone, int nitems)
2906 {
2907         uma_keg_t keg;
2908
2909         ZONE_LOCK(zone);
2910         keg = zone_first_keg(zone);
2911         keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2912         if (keg->uk_maxpages * keg->uk_ipers < nitems)
2913                 keg->uk_maxpages += keg->uk_ppera;
2914         nitems = keg->uk_maxpages * keg->uk_ipers;
2915         ZONE_UNLOCK(zone);
2916
2917         return (nitems);
2918 }
2919
2920 /* See uma.h */
2921 int
2922 uma_zone_get_max(uma_zone_t zone)
2923 {
2924         int nitems;
2925         uma_keg_t keg;
2926
2927         ZONE_LOCK(zone);
2928         keg = zone_first_keg(zone);
2929         nitems = keg->uk_maxpages * keg->uk_ipers;
2930         ZONE_UNLOCK(zone);
2931
2932         return (nitems);
2933 }
2934
2935 /* See uma.h */
2936 void
2937 uma_zone_set_warning(uma_zone_t zone, const char *warning)
2938 {
2939
2940         ZONE_LOCK(zone);
2941         zone->uz_warning = warning;
2942         ZONE_UNLOCK(zone);
2943 }
2944
2945 /* See uma.h */
2946 int
2947 uma_zone_get_cur(uma_zone_t zone)
2948 {
2949         int64_t nitems;
2950         u_int i;
2951
2952         ZONE_LOCK(zone);
2953         nitems = zone->uz_allocs - zone->uz_frees;
2954         CPU_FOREACH(i) {
2955                 /*
2956                  * See the comment in sysctl_vm_zone_stats() regarding the
2957                  * safety of accessing the per-cpu caches. With the zone lock
2958                  * held, it is safe, but can potentially result in stale data.
2959                  */
2960                 nitems += zone->uz_cpu[i].uc_allocs -
2961                     zone->uz_cpu[i].uc_frees;
2962         }
2963         ZONE_UNLOCK(zone);
2964
2965         return (nitems < 0 ? 0 : nitems);
2966 }
2967
2968 /* See uma.h */
2969 void
2970 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2971 {
2972         uma_keg_t keg;
2973
2974         ZONE_LOCK(zone);
2975         keg = zone_first_keg(zone);
2976         KASSERT(keg->uk_pages == 0,
2977             ("uma_zone_set_init on non-empty keg"));
2978         keg->uk_init = uminit;
2979         ZONE_UNLOCK(zone);
2980 }
2981
2982 /* See uma.h */
2983 void
2984 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2985 {
2986         uma_keg_t keg;
2987
2988         ZONE_LOCK(zone);
2989         keg = zone_first_keg(zone);
2990         KASSERT(keg->uk_pages == 0,
2991             ("uma_zone_set_fini on non-empty keg"));
2992         keg->uk_fini = fini;
2993         ZONE_UNLOCK(zone);
2994 }
2995
2996 /* See uma.h */
2997 void
2998 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2999 {
3000         ZONE_LOCK(zone);
3001         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3002             ("uma_zone_set_zinit on non-empty keg"));
3003         zone->uz_init = zinit;
3004         ZONE_UNLOCK(zone);
3005 }
3006
3007 /* See uma.h */
3008 void
3009 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3010 {
3011         ZONE_LOCK(zone);
3012         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3013             ("uma_zone_set_zfini on non-empty keg"));
3014         zone->uz_fini = zfini;
3015         ZONE_UNLOCK(zone);
3016 }
3017
3018 /* See uma.h */
3019 /* XXX uk_freef is not actually used with the zone locked */
3020 void
3021 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3022 {
3023
3024         ZONE_LOCK(zone);
3025         zone_first_keg(zone)->uk_freef = freef;
3026         ZONE_UNLOCK(zone);
3027 }
3028
3029 /* See uma.h */
3030 /* XXX uk_allocf is not actually used with the zone locked */
3031 void
3032 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3033 {
3034         uma_keg_t keg;
3035
3036         ZONE_LOCK(zone);
3037         keg = zone_first_keg(zone);
3038         keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
3039         keg->uk_allocf = allocf;
3040         ZONE_UNLOCK(zone);
3041 }
3042
3043 /* See uma.h */
3044 int
3045 uma_zone_reserve_kva(uma_zone_t zone, int count)
3046 {
3047         uma_keg_t keg;
3048         vm_offset_t kva;
3049         int pages;
3050
3051         keg = zone_first_keg(zone);
3052         pages = count / keg->uk_ipers;
3053
3054         if (pages * keg->uk_ipers < count)
3055                 pages++;
3056
3057 #ifdef UMA_MD_SMALL_ALLOC
3058         if (keg->uk_ppera > 1) {
3059 #else
3060         if (1) {
3061 #endif
3062                 kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
3063                 if (kva == 0)
3064                         return (0);
3065         } else
3066                 kva = 0;
3067         ZONE_LOCK(zone);
3068         keg->uk_kva = kva;
3069         keg->uk_offset = 0;
3070         keg->uk_maxpages = pages;
3071 #ifdef UMA_MD_SMALL_ALLOC
3072         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3073 #else
3074         keg->uk_allocf = noobj_alloc;
3075 #endif
3076         keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
3077         ZONE_UNLOCK(zone);
3078         return (1);
3079 }
3080
3081 /* See uma.h */
3082 void
3083 uma_prealloc(uma_zone_t zone, int items)
3084 {
3085         int slabs;
3086         uma_slab_t slab;
3087         uma_keg_t keg;
3088
3089         keg = zone_first_keg(zone);
3090         ZONE_LOCK(zone);
3091         slabs = items / keg->uk_ipers;
3092         if (slabs * keg->uk_ipers < items)
3093                 slabs++;
3094         while (slabs > 0) {
3095                 slab = keg_alloc_slab(keg, zone, M_WAITOK);
3096                 if (slab == NULL)
3097                         break;
3098                 MPASS(slab->us_keg == keg);
3099                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3100                 slabs--;
3101         }
3102         ZONE_UNLOCK(zone);
3103 }
3104
3105 /* See uma.h */
3106 uint32_t *
3107 uma_find_refcnt(uma_zone_t zone, void *item)
3108 {
3109         uma_slabrefcnt_t slabref;
3110         uma_keg_t keg;
3111         uint32_t *refcnt;
3112         int idx;
3113
3114         slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
3115             (~UMA_SLAB_MASK));
3116         keg = slabref->us_keg;
3117         KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
3118             ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
3119         idx = ((unsigned long)item - (unsigned long)slabref->us_data)
3120             / keg->uk_rsize;
3121         refcnt = &slabref->us_freelist[idx].us_refcnt;
3122         return refcnt;
3123 }
3124
3125 /* See uma.h */
3126 void
3127 uma_reclaim(void)
3128 {
3129 #ifdef UMA_DEBUG
3130         printf("UMA: vm asked us to release pages!\n");
3131 #endif
3132         bucket_enable();
3133         zone_foreach(zone_drain);
3134         /*
3135          * Some slabs may have been freed but this zone will be visited early
3136          * we visit again so that we can free pages that are empty once other
3137          * zones are drained.  We have to do the same for buckets.
3138          */
3139         zone_drain(slabzone);
3140         zone_drain(slabrefzone);
3141         bucket_zone_drain();
3142 }
3143
3144 /* See uma.h */
3145 int
3146 uma_zone_exhausted(uma_zone_t zone)
3147 {
3148         int full;
3149
3150         ZONE_LOCK(zone);
3151         full = (zone->uz_flags & UMA_ZFLAG_FULL);
3152         ZONE_UNLOCK(zone);
3153         return (full);
3154 }
3155
3156 int
3157 uma_zone_exhausted_nolock(uma_zone_t zone)
3158 {
3159         return (zone->uz_flags & UMA_ZFLAG_FULL);
3160 }
3161
3162 void *
3163 uma_large_malloc(int size, int wait)
3164 {
3165         void *mem;
3166         uma_slab_t slab;
3167         uint8_t flags;
3168
3169         slab = zone_alloc_item(slabzone, NULL, wait);
3170         if (slab == NULL)
3171                 return (NULL);
3172         mem = page_alloc(NULL, size, &flags, wait);
3173         if (mem) {
3174                 vsetslab((vm_offset_t)mem, slab);
3175                 slab->us_data = mem;
3176                 slab->us_flags = flags | UMA_SLAB_MALLOC;
3177                 slab->us_size = size;
3178         } else {
3179                 zone_free_item(slabzone, slab, NULL, SKIP_NONE,
3180                     ZFREE_STATFAIL | ZFREE_STATFREE);
3181         }
3182
3183         return (mem);
3184 }
3185
3186 void
3187 uma_large_free(uma_slab_t slab)
3188 {
3189         vsetobj((vm_offset_t)slab->us_data, kmem_object);
3190         page_free(slab->us_data, slab->us_size, slab->us_flags);
3191         zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
3192 }
3193
3194 void
3195 uma_print_stats(void)
3196 {
3197         zone_foreach(uma_print_zone);
3198 }
3199
3200 static void
3201 slab_print(uma_slab_t slab)
3202 {
3203         printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
3204                 slab->us_keg, slab->us_data, slab->us_freecount,
3205                 slab->us_firstfree);
3206 }
3207
3208 static void
3209 cache_print(uma_cache_t cache)
3210 {
3211         printf("alloc: %p(%d), free: %p(%d)\n",
3212                 cache->uc_allocbucket,
3213                 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3214                 cache->uc_freebucket,
3215                 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3216 }
3217
3218 static void
3219 uma_print_keg(uma_keg_t keg)
3220 {
3221         uma_slab_t slab;
3222
3223         printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3224             "out %d free %d limit %d\n",
3225             keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3226             keg->uk_ipers, keg->uk_ppera,
3227             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3228             (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3229         printf("Part slabs:\n");
3230         LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3231                 slab_print(slab);
3232         printf("Free slabs:\n");
3233         LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3234                 slab_print(slab);
3235         printf("Full slabs:\n");
3236         LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3237                 slab_print(slab);
3238 }
3239
3240 void
3241 uma_print_zone(uma_zone_t zone)
3242 {
3243         uma_cache_t cache;
3244         uma_klink_t kl;
3245         int i;
3246
3247         printf("zone: %s(%p) size %d flags %#x\n",
3248             zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3249         LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3250                 uma_print_keg(kl->kl_keg);
3251         CPU_FOREACH(i) {
3252                 cache = &zone->uz_cpu[i];
3253                 printf("CPU %d Cache:\n", i);
3254                 cache_print(cache);
3255         }
3256 }
3257
#ifdef DDB
/*
 * Total up a zone's statistics across the zone itself and all of its
 * per-CPU caches.  Each output pointer may be NULL, in which case that
 * statistic is simply not reported.
 *
 * Note: the zone counters are left untouched, since the per-CPU cache
 * counters cannot be cleared safely from here.
 *
 * XXXRW: Dereferencing uc_allocbucket and uc_freebucket from another
 * CPU is not safe; the caches should track this information directly so
 * that it is not necessary.
 */
static void
uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
    uint64_t *freesp, uint64_t *sleepsp)
{
        uma_cache_t pc;
        uint64_t nallocs, nfrees, nsleeps;
        int cached, cpu;

        /* Start from the zone-wide counters ... */
        nallocs = z->uz_allocs;
        nfrees = z->uz_frees;
        nsleeps = z->uz_sleeps;
        cached = 0;

        /* ... and fold in each CPU's share. */
        CPU_FOREACH(cpu) {
                pc = &z->uz_cpu[cpu];
                if (pc->uc_allocbucket != NULL)
                        cached += pc->uc_allocbucket->ub_cnt;
                if (pc->uc_freebucket != NULL)
                        cached += pc->uc_freebucket->ub_cnt;
                nallocs += pc->uc_allocs;
                nfrees += pc->uc_frees;
        }

        if (cachefreep != NULL)
                *cachefreep = cached;
        if (allocsp != NULL)
                *allocsp = nallocs;
        if (freesp != NULL)
                *freesp = nfrees;
        if (sleepsp != NULL)
                *sleepsp = nsleeps;
}
#endif /* DDB */
3302
3303 static int
3304 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3305 {
3306         uma_keg_t kz;
3307         uma_zone_t z;
3308         int count;
3309
3310         count = 0;
3311         mtx_lock(&uma_mtx);
3312         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3313                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3314                         count++;
3315         }
3316         mtx_unlock(&uma_mtx);
3317         return (sysctl_handle_int(oidp, &count, 0, req));
3318 }
3319
3320 static int
3321 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3322 {
3323         struct uma_stream_header ush;
3324         struct uma_type_header uth;
3325         struct uma_percpu_stat ups;
3326         uma_bucket_t bucket;
3327         struct sbuf sbuf;
3328         uma_cache_t cache;
3329         uma_klink_t kl;
3330         uma_keg_t kz;
3331         uma_zone_t z;
3332         uma_keg_t k;
3333         int count, error, i;
3334
3335         error = sysctl_wire_old_buffer(req, 0);
3336         if (error != 0)
3337                 return (error);
3338         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3339
3340         count = 0;
3341         mtx_lock(&uma_mtx);
3342         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3343                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3344                         count++;
3345         }
3346
3347         /*
3348          * Insert stream header.
3349          */
3350         bzero(&ush, sizeof(ush));
3351         ush.ush_version = UMA_STREAM_VERSION;
3352         ush.ush_maxcpus = (mp_maxid + 1);
3353         ush.ush_count = count;
3354         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3355
3356         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3357                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3358                         bzero(&uth, sizeof(uth));
3359                         ZONE_LOCK(z);
3360                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3361                         uth.uth_align = kz->uk_align;
3362                         uth.uth_size = kz->uk_size;
3363                         uth.uth_rsize = kz->uk_rsize;
3364                         LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3365                                 k = kl->kl_keg;
3366                                 uth.uth_maxpages += k->uk_maxpages;
3367                                 uth.uth_pages += k->uk_pages;
3368                                 uth.uth_keg_free += k->uk_free;
3369                                 uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3370                                     * k->uk_ipers;
3371                         }
3372
3373                         /*
3374                          * A zone is secondary is it is not the first entry
3375                          * on the keg's zone list.
3376                          */
3377                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3378                             (LIST_FIRST(&kz->uk_zones) != z))
3379                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3380
3381                         LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3382                                 uth.uth_zone_free += bucket->ub_cnt;
3383                         uth.uth_allocs = z->uz_allocs;
3384                         uth.uth_frees = z->uz_frees;
3385                         uth.uth_fails = z->uz_fails;
3386                         uth.uth_sleeps = z->uz_sleeps;
3387                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3388                         /*
3389                          * While it is not normally safe to access the cache
3390                          * bucket pointers while not on the CPU that owns the
3391                          * cache, we only allow the pointers to be exchanged
3392                          * without the zone lock held, not invalidated, so
3393                          * accept the possible race associated with bucket
3394                          * exchange during monitoring.
3395                          */
3396                         for (i = 0; i < (mp_maxid + 1); i++) {
3397                                 bzero(&ups, sizeof(ups));
3398                                 if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3399                                         goto skip;
3400                                 if (CPU_ABSENT(i))
3401                                         goto skip;
3402                                 cache = &z->uz_cpu[i];
3403                                 if (cache->uc_allocbucket != NULL)
3404                                         ups.ups_cache_free +=
3405                                             cache->uc_allocbucket->ub_cnt;
3406                                 if (cache->uc_freebucket != NULL)
3407                                         ups.ups_cache_free +=
3408                                             cache->uc_freebucket->ub_cnt;
3409                                 ups.ups_allocs = cache->uc_allocs;
3410                                 ups.ups_frees = cache->uc_frees;
3411 skip:
3412                                 (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3413                         }
3414                         ZONE_UNLOCK(z);
3415                 }
3416         }
3417         mtx_unlock(&uma_mtx);
3418         error = sbuf_finish(&sbuf);
3419         sbuf_delete(&sbuf);
3420         return (error);
3421 }
3422
#ifdef DDB
/*
 * DDB "show uma": print one line per zone with item size, items in use,
 * free items, total requests and sleeps.
 *
 * "Free" adds up cached items (per-CPU buckets and the zone's full
 * buckets) and, except for secondary zones, the keg's free items.
 * Zones on UMA_ZFLAG_INTERNAL kegs use the zone counters only.
 */
DB_SHOW_COMMAND(uma, db_show_uma)
{
        uint64_t allocs, frees, sleeps;
        uma_bucket_t bucket;
        uma_keg_t kz;
        uma_zone_t z;
        int cachefree;

        db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
            "Requests", "Sleeps");
        LIST_FOREACH(kz, &uma_kegs, uk_link) {
                LIST_FOREACH(z, &kz->uk_zones, uz_link) {
                        if ((kz->uk_flags & UMA_ZFLAG_INTERNAL) != 0) {
                                allocs = z->uz_allocs;
                                frees = z->uz_frees;
                                sleeps = z->uz_sleeps;
                                cachefree = 0;
                        } else {
                                uma_zone_sumstat(z, &cachefree, &allocs,
                                    &frees, &sleeps);
                        }

                        /* Only a non-secondary zone counts the keg's free items. */
                        if ((z->uz_flags & UMA_ZONE_SECONDARY) == 0 ||
                            LIST_FIRST(&kz->uk_zones) == z)
                                cachefree += kz->uk_free;
                        LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
                                cachefree += bucket->ub_cnt;

                        db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
                            (uintmax_t)kz->uk_size,
                            (intmax_t)(allocs - frees), cachefree,
                            (uintmax_t)allocs, sleeps);
                        if (db_pager_quit)
                                return;
                }
        }
}
#endif