sys/vm/uma_core.c

   1 /*-
   2  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
   3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   4  * Copyright (c) 2004-2006 Robert N. M. Watson
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice unmodified, this list of conditions, and the following
  12  *    disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * uma_core.c  Implementation of the Universal Memory allocator
  31  *
  32  * This allocator is intended to replace the multitude of similar object caches
  33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
   34  * efficient.  A primary design goal is to return unused memory to the rest of
  35  * the system.  This will make the system as a whole more flexible due to the
  36  * ability to move memory to subsystems which most need it instead of leaving
  37  * pools of reserved memory unused.
  38  *
  39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  40  * are well known.
  41  *
  42  */
  43
  44 /*
  45  * TODO:
  46  *      - Improve memory usage for large allocations
  47  *      - Investigate cache size adjustments
  48  */
  49
  50 #include <sys/cdefs.h>
  51 __FBSDID("$FreeBSD$");
  52
  53 /* I should really use ktr.. */
  54 /*
  55 #define UMA_DEBUG 1
  56 #define UMA_DEBUG_ALLOC 1
  57 #define UMA_DEBUG_ALLOC_1 1
  58 */
  59
  60 #include "opt_ddb.h"
  61 #include "opt_param.h"
  62 #include "opt_vm.h"
  63
  64 #include <sys/param.h>
  65 #include <sys/systm.h>
  66 #include <sys/kernel.h>
  67 #include <sys/types.h>
  68 #include <sys/queue.h>
  69 #include <sys/malloc.h>
  70 #include <sys/ktr.h>
  71 #include <sys/lock.h>
  72 #include <sys/sysctl.h>
  73 #include <sys/mutex.h>
  74 #include <sys/proc.h>
  75 #include <sys/rwlock.h>
  76 #include <sys/sbuf.h>
  77 #include <sys/smp.h>
  78 #include <sys/vmmeter.h>
  79
  80 #include <vm/vm.h>
  81 #include <vm/vm_object.h>
  82 #include <vm/vm_page.h>
  83 #include <vm/vm_pageout.h>
  84 #include <vm/vm_param.h>
  85 #include <vm/vm_map.h>
  86 #include <vm/vm_kern.h>
  87 #include <vm/vm_extern.h>
  88 #include <vm/uma.h>
  89 #include <vm/uma_int.h>
  90 #include <vm/uma_dbg.h>
  91
  92 #include <ddb/ddb.h>
  93
  94 #ifdef DEBUG_MEMGUARD
  95 #include <vm/memguard.h>
  96 #endif
  97
  98 /*
  99  * This is the zone and keg from which all zones are spawned.  The idea is that
 100  * even the zone & keg heads are allocated from the allocator, so we use the
 101  * bss section to bootstrap us.
 102  */
 103 static struct uma_keg masterkeg;
 104 static struct uma_zone masterzone_k;
 105 static struct uma_zone masterzone_z;
 106 static uma_zone_t kegs = &masterzone_k;
 107 static uma_zone_t zones = &masterzone_z;
 108
 109 /* This is the zone from which all of uma_slab_t's are allocated. */
 110 static uma_zone_t slabzone;
 111 static uma_zone_t slabrefzone;  /* With refcounters (for UMA_ZONE_REFCNT) */
 112
 113 /*
 114  * The initial hash tables come out of this zone so they can be allocated
 115  * prior to malloc coming up.
 116  */
 117 static uma_zone_t hashzone;
 118
 119 /* The boot-time adjusted value for cache line alignment. */
 120 int uma_align_cache = 64 - 1;
 121
 122 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 123
 124 /*
 125  * Are we allowed to allocate buckets?
 126  */
 127 static int bucketdisable = 1;
 128
 129 /* Linked list of all kegs in the system */
 130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 131
 132 /* This mutex protects the keg list */
 133 static struct mtx uma_mtx;
 134
 135 /* Linked list of boot time pages */
 136 static LIST_HEAD(,uma_slab) uma_boot_pages =
 137     LIST_HEAD_INITIALIZER(uma_boot_pages);
 138
 139 /* This mutex protects the boot time pages list */
 140 static struct mtx uma_boot_pages_mtx;
 141
 142 /* Is the VM done starting up? */
 143 static int booted = 0;
 144 #define UMA_STARTUP     1
 145 #define UMA_STARTUP2    2
 146
 147 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
 148 static u_int uma_max_ipers;
 149 static u_int uma_max_ipers_ref;
 150
 151 /*
 152  * This is the handle used to schedule events that need to happen
 153  * outside of the allocation fast path.
 154  */
 155 static struct callout uma_callout;
 156 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 157
/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 *
 * NOTE(review): apart from 'keg', the fields appear to line up with the
 * argument order used when calling uma_zcreate() (see bucket_init()):
 * name, size, ctor, dtor, uminit, fini, align, flags -- confirm against
 * zone_ctor().
 */
struct uma_zctor_args {
        const char *name;
        size_t size;
        uma_ctor ctor;
        uma_dtor dtor;
        uma_init uminit;
        uma_fini fini;
        uma_keg_t keg;          /* presumably an existing keg to attach to; verify */
        int align;
        u_int32_t flags;
};
 173
/*
 * Argument bundle whose fields match, one for one and in order, the
 * parameters of uma_kcreate(): zone, size, uminit, fini, align, flags.
 * NOTE(review): presumably handed to keg_ctor() as its ctor argument,
 * mirroring uma_zctor_args for zones -- confirm.
 */
struct uma_kctor_args {
        uma_zone_t zone;
        size_t size;
        uma_init uminit;
        uma_fini fini;
        int align;
        u_int32_t flags;
};
 182
/*
 * One entry of the bucket_zones[] table: a zone that supplies buckets of a
 * single fixed capacity.
 */
struct uma_bucket_zone {
        uma_zone_t      ubz_zone;       /* Filled in by bucket_init() via uma_zcreate(). */
        char            *ubz_name;      /* Name passed to uma_zcreate() for this zone. */
        int             ubz_entries;    /* Item pointers per bucket; 0 ends the table. */
};
 188
 189 #define BUCKET_MAX      128
 190
/*
 * Table of bucket zones in increasing capacity order.  The zone pointers
 * start out NULL and are filled in by bucket_init().  The final entry with
 * ubz_entries == 0 is the sentinel that the iterating loops stop on.
 */
struct uma_bucket_zone bucket_zones[] = {
        { NULL, "16 Bucket", 16 },
        { NULL, "32 Bucket", 32 },
        { NULL, "64 Bucket", 64 },
        { NULL, "128 Bucket", 128 },
        { NULL, NULL, 0}
};
 198
 199 #define BUCKET_SHIFT    4
 200 #define BUCKET_ZONES    ((BUCKET_MAX >> BUCKET_SHIFT) + 1)
 201
 202 /*
 203  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
 204  * of approximately the right size.
 205  */
 206 static uint8_t bucket_size[BUCKET_ZONES];
 207
 208 /*
 209  * Flags and enumerations to be passed to internal functions.
 210  */
 211 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
 212
 213 #define ZFREE_STATFAIL  0x00000001      /* Update zone failure statistic. */
 214 #define ZFREE_STATFREE  0x00000002      /* Update zone free statistic. */
 215
 216 /* Prototypes.. */
 217
 218 static void *noobj_alloc(uma_zone_t, int, u_int8_t *, int);
 219 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 220 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
 221 static void page_free(void *, int, u_int8_t);
 222 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 223 static void cache_drain(uma_zone_t);
 224 static void bucket_drain(uma_zone_t, uma_bucket_t);
 225 static void bucket_cache_drain(uma_zone_t zone);
 226 static int keg_ctor(void *, int, void *, int);
 227 static void keg_dtor(void *, int, void *);
 228 static int zone_ctor(void *, int, void *, int);
 229 static void zone_dtor(void *, int, void *);
 230 static int zero_init(void *, int, int);
 231 static void keg_small_init(uma_keg_t keg);
 232 static void keg_large_init(uma_keg_t keg);
 233 static void zone_foreach(void (*zfunc)(uma_zone_t));
 234 static void zone_timeout(uma_zone_t zone);
 235 static int hash_alloc(struct uma_hash *);
 236 static int hash_expand(struct uma_hash *, struct uma_hash *);
 237 static void hash_free(struct uma_hash *hash);
 238 static void uma_timeout(void *);
 239 static void uma_startup3(void);
 240 static void *zone_alloc_item(uma_zone_t, void *, int);
 241 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
 242     int);
 243 static void bucket_enable(void);
 244 static void bucket_init(void);
 245 static uma_bucket_t bucket_alloc(int, int);
 246 static void bucket_free(uma_bucket_t);
 247 static void bucket_zone_drain(void);
 248 static int zone_alloc_bucket(uma_zone_t zone, int flags);
 249 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 250 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
 251 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
 252 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
 253     uma_fini fini, int align, u_int32_t flags);
 254 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
 255 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
 256
 257 void uma_print_zone(uma_zone_t);
 258 void uma_print_stats(void);
 259 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 260 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 261
 262 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 263
 264 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
 265     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 266
 267 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
 268     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 269
 270 static int zone_warnings = 1;
 271 TUNABLE_INT("vm.zone_warnings", &zone_warnings);
 272 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
 273     "Warn when UMA zones becomes full");
 274
 275 /*
 276  * This routine checks to see whether or not it's safe to enable buckets.
 277  */
 278
 279 static void
 280 bucket_enable(void)
 281 {
 282         bucketdisable = vm_page_count_min();
 283 }
 284
 285 /*
 286  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 287  *
 288  * For each zone, calculate the memory required for each bucket, consisting
 289  * of the header and an array of pointers.  Initialize bucket_size[] to point
 290  * the range of appropriate bucket sizes at the zone.
 291  */
 292 static void
 293 bucket_init(void)
 294 {
 295         struct uma_bucket_zone *ubz;
 296         int i;
 297         int j;
 298
 299         for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
 300                 int size;
 301
 302                 ubz = &bucket_zones[j];
 303                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 304                 size += sizeof(void *) * ubz->ubz_entries;
 305                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 306                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 307                     UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
 308                 for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
 309                         bucket_size[i >> BUCKET_SHIFT] = j;
 310         }
 311 }
 312
 313 /*
 314  * Given a desired number of entries for a bucket, return the zone from which
 315  * to allocate the bucket.
 316  */
 317 static struct uma_bucket_zone *
 318 bucket_zone_lookup(int entries)
 319 {
 320         int idx;
 321
 322         idx = howmany(entries, 1 << BUCKET_SHIFT);
 323         return (&bucket_zones[bucket_size[idx]]);
 324 }
 325
 326 static uma_bucket_t
 327 bucket_alloc(int entries, int bflags)
 328 {
 329         struct uma_bucket_zone *ubz;
 330         uma_bucket_t bucket;
 331
 332         /*
 333          * This is to stop us from allocating per cpu buckets while we're
 334          * running out of vm.boot_pages.  Otherwise, we would exhaust the
 335          * boot pages.  This also prevents us from allocating buckets in
 336          * low memory situations.
 337          */
 338         if (bucketdisable)
 339                 return (NULL);
 340
 341         ubz = bucket_zone_lookup(entries);
 342         bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
 343         if (bucket) {
 344 #ifdef INVARIANTS
 345                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 346 #endif
 347                 bucket->ub_cnt = 0;
 348                 bucket->ub_entries = ubz->ubz_entries;
 349         }
 350
 351         return (bucket);
 352 }
 353
 354 static void
 355 bucket_free(uma_bucket_t bucket)
 356 {
 357         struct uma_bucket_zone *ubz;
 358
 359         ubz = bucket_zone_lookup(bucket->ub_entries);
 360         zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
 361             ZFREE_STATFREE);
 362 }
 363
 364 static void
 365 bucket_zone_drain(void)
 366 {
 367         struct uma_bucket_zone *ubz;
 368
 369         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 370                 zone_drain(ubz->ubz_zone);
 371 }
 372
 373 static void
 374 zone_log_warning(uma_zone_t zone)
 375 {
 376         static const struct timeval warninterval = { 300, 0 };
 377
 378         if (!zone_warnings || zone->uz_warning == NULL)
 379                 return;
 380
 381         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 382                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 383 }
 384
 385 static inline uma_keg_t
 386 zone_first_keg(uma_zone_t zone)
 387 {
 388
 389         return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
 390 }
 391
 392 static void
 393 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 394 {
 395         uma_klink_t klink;
 396
 397         LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 398                 kegfn(klink->kl_keg);
 399 }
 400
 401 /*
 402  * Routine called by timeout which is used to fire off some time interval
 403  * based calculations.  (stats, hash size, etc.)
 404  *
 405  * Arguments:
 406  *      arg   Unused
 407  *
 408  * Returns:
 409  *      Nothing
 410  */
 411 static void
 412 uma_timeout(void *unused)
 413 {
 414         bucket_enable();
 415         zone_foreach(zone_timeout);
 416
 417         /* Reschedule this event */
 418         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 419 }
 420
 421 /*
 422  * Routine to perform timeout driven calculations.  This expands the
 423  * hashes and does per cpu statistics aggregation.
 424  *
 425  *  Returns nothing.
 426  */
 427 static void
 428 keg_timeout(uma_keg_t keg)
 429 {
 430
 431         KEG_LOCK(keg);
 432         /*
 433          * Expand the keg hash table.
 434          *
 435          * This is done if the number of slabs is larger than the hash size.
 436          * What I'm trying to do here is completely reduce collisions.  This
 437          * may be a little aggressive.  Should I allow for two collisions max?
 438          */
 439         if (keg->uk_flags & UMA_ZONE_HASH &&
 440             keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 441                 struct uma_hash newhash;
 442                 struct uma_hash oldhash;
 443                 int ret;
 444
 445                 /*
 446                  * This is so involved because allocating and freeing
 447                  * while the keg lock is held will lead to deadlock.
 448                  * I have to do everything in stages and check for
 449                  * races.
 450                  */
 451                 newhash = keg->uk_hash;
 452                 KEG_UNLOCK(keg);
 453                 ret = hash_alloc(&newhash);
 454                 KEG_LOCK(keg);
 455                 if (ret) {
 456                         if (hash_expand(&keg->uk_hash, &newhash)) {
 457                                 oldhash = keg->uk_hash;
 458                                 keg->uk_hash = newhash;
 459                         } else
 460                                 oldhash = newhash;
 461
 462                         KEG_UNLOCK(keg);
 463                         hash_free(&oldhash);
 464                         KEG_LOCK(keg);
 465                 }
 466         }
 467         KEG_UNLOCK(keg);
 468 }
 469
 470 static void
 471 zone_timeout(uma_zone_t zone)
 472 {
 473
 474         zone_foreach_keg(zone, &keg_timeout);
 475 }
 476
 477 /*
 478  * Allocate and zero fill the next sized hash table from the appropriate
 479  * backing store.
 480  *
 481  * Arguments:
 482  *      hash  A new hash structure with the old hash size in uh_hashsize
 483  *
 484  * Returns:
 485  *      1 on sucess and 0 on failure.
 486  */
 487 static int
 488 hash_alloc(struct uma_hash *hash)
 489 {
 490         int oldsize;
 491         int alloc;
 492
 493         oldsize = hash->uh_hashsize;
 494
 495         /* We're just going to go to a power of two greater */
 496         if (oldsize)  {
 497                 hash->uh_hashsize = oldsize * 2;
 498                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 499                 hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 500                     M_UMAHASH, M_NOWAIT);
 501         } else {
 502                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 503                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 504                     M_WAITOK);
 505                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 506         }
 507         if (hash->uh_slab_hash) {
 508                 bzero(hash->uh_slab_hash, alloc);
 509                 hash->uh_hashmask = hash->uh_hashsize - 1;
 510                 return (1);
 511         }
 512
 513         return (0);
 514 }
 515
 516 /*
 517  * Expands the hash table for HASH zones.  This is done from zone_timeout
 518  * to reduce collisions.  This must not be done in the regular allocation
 519  * path, otherwise, we can recurse on the vm while allocating pages.
 520  *
 521  * Arguments:
 522  *      oldhash  The hash you want to expand
 523  *      newhash  The hash structure for the new table
 524  *
 525  * Returns:
 526  *      Nothing
 527  *
 528  * Discussion:
 529  */
 530 static int
 531 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 532 {
 533         uma_slab_t slab;
 534         int hval;
 535         int i;
 536
 537         if (!newhash->uh_slab_hash)
 538                 return (0);
 539
 540         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 541                 return (0);
 542
 543         /*
 544          * I need to investigate hash algorithms for resizing without a
 545          * full rehash.
 546          */
 547
 548         for (i = 0; i < oldhash->uh_hashsize; i++)
 549                 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 550                         slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 551                         SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 552                         hval = UMA_HASH(newhash, slab->us_data);
 553                         SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 554                             slab, us_hlink);
 555                 }
 556
 557         return (1);
 558 }
 559
 560 /*
 561  * Free the hash bucket to the appropriate backing store.
 562  *
 563  * Arguments:
 564  *      slab_hash  The hash bucket we're freeing
 565  *      hashsize   The number of entries in that hash bucket
 566  *
 567  * Returns:
 568  *      Nothing
 569  */
 570 static void
 571 hash_free(struct uma_hash *hash)
 572 {
 573         if (hash->uh_slab_hash == NULL)
 574                 return;
 575         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 576                 zone_free_item(hashzone,
 577                     hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
 578         else
 579                 free(hash->uh_slab_hash, M_UMAHASH);
 580 }
 581
 582 /*
 583  * Frees all outstanding items in a bucket
 584  *
 585  * Arguments:
 586  *      zone   The zone to free to, must be unlocked.
 587  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 588  *
 589  * Returns:
 590  *      Nothing
 591  */
 592
 593 static void
 594 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 595 {
 596         void *item;
 597
 598         if (bucket == NULL)
 599                 return;
 600
 601         while (bucket->ub_cnt > 0)  {
 602                 bucket->ub_cnt--;
 603                 item = bucket->ub_bucket[bucket->ub_cnt];
 604 #ifdef INVARIANTS
 605                 bucket->ub_bucket[bucket->ub_cnt] = NULL;
 606                 KASSERT(item != NULL,
 607                     ("bucket_drain: botched ptr, item is NULL"));
 608 #endif
 609                 zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
 610         }
 611 }
 612
 613 /*
 614  * Drains the per cpu caches for a zone.
 615  *
 616  * NOTE: This may only be called while the zone is being turn down, and not
 617  * during normal operation.  This is necessary in order that we do not have
 618  * to migrate CPUs to drain the per-CPU caches.
 619  *
 620  * Arguments:
 621  *      zone     The zone to drain, must be unlocked.
 622  *
 623  * Returns:
 624  *      Nothing
 625  */
 626 static void
 627 cache_drain(uma_zone_t zone)
 628 {
 629         uma_cache_t cache;
 630         int cpu;
 631
 632         /*
 633          * XXX: It is safe to not lock the per-CPU caches, because we're
 634          * tearing down the zone anyway.  I.e., there will be no further use
 635          * of the caches at this point.
 636          *
 637          * XXX: It would good to be able to assert that the zone is being
 638          * torn down to prevent improper use of cache_drain().
 639          *
 640          * XXX: We lock the zone before passing into bucket_cache_drain() as
 641          * it is used elsewhere.  Should the tear-down path be made special
 642          * there in some form?
 643          */
 644         CPU_FOREACH(cpu) {
 645                 cache = &zone->uz_cpu[cpu];
 646                 bucket_drain(zone, cache->uc_allocbucket);
 647                 bucket_drain(zone, cache->uc_freebucket);
 648                 if (cache->uc_allocbucket != NULL)
 649                         bucket_free(cache->uc_allocbucket);
 650                 if (cache->uc_freebucket != NULL)
 651                         bucket_free(cache->uc_freebucket);
 652                 cache->uc_allocbucket = cache->uc_freebucket = NULL;
 653         }
 654         ZONE_LOCK(zone);
 655         bucket_cache_drain(zone);
 656         ZONE_UNLOCK(zone);
 657 }
 658
 659 /*
 660  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
 661  */
 662 static void
 663 bucket_cache_drain(uma_zone_t zone)
 664 {
 665         uma_bucket_t bucket;
 666
 667         /*
 668          * Drain the bucket queues and free the buckets, we just keep two per
 669          * cpu (alloc/free).
 670          */
 671         while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 672                 LIST_REMOVE(bucket, ub_link);
 673                 ZONE_UNLOCK(zone);
 674                 bucket_drain(zone, bucket);
 675                 bucket_free(bucket);
 676                 ZONE_LOCK(zone);
 677         }
 678
 679         /* Now we do the free queue.. */
 680         while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 681                 LIST_REMOVE(bucket, ub_link);
 682                 bucket_free(bucket);
 683         }
 684 }
 685
 686 /*
 687  * Frees pages from a keg back to the system.  This is done on demand from
 688  * the pageout daemon.
 689  *
 690  * Returns nothing.
 691  */
 692 static void
 693 keg_drain(uma_keg_t keg)
 694 {
 695         struct slabhead freeslabs = { 0 };
 696         uma_slab_t slab;
 697         uma_slab_t n;
 698         u_int8_t flags;
 699         u_int8_t *mem;
 700         int i;
 701
 702         /*
 703          * We don't want to take pages from statically allocated kegs at this
 704          * time
 705          */
 706         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 707                 return;
 708
 709 #ifdef UMA_DEBUG
 710         printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 711 #endif
 712         KEG_LOCK(keg);
 713         if (keg->uk_free == 0)
 714                 goto finished;
 715
 716         slab = LIST_FIRST(&keg->uk_free_slab);
 717         while (slab) {
 718                 n = LIST_NEXT(slab, us_link);
 719
 720                 /* We have no where to free these to */
 721                 if (slab->us_flags & UMA_SLAB_BOOT) {
 722                         slab = n;
 723                         continue;
 724                 }
 725
 726                 LIST_REMOVE(slab, us_link);
 727                 keg->uk_pages -= keg->uk_ppera;
 728                 keg->uk_free -= keg->uk_ipers;
 729
 730                 if (keg->uk_flags & UMA_ZONE_HASH)
 731                         UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 732
 733                 SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 734
 735                 slab = n;
 736         }
 737 finished:
 738         KEG_UNLOCK(keg);
 739
 740         while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 741                 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 742                 if (keg->uk_fini)
 743                         for (i = 0; i < keg->uk_ipers; i++)
 744                                 keg->uk_fini(
 745                                     slab->us_data + (keg->uk_rsize * i),
 746                                     keg->uk_size);
 747                 flags = slab->us_flags;
 748                 mem = slab->us_data;
 749
 750                 if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
 751                         vm_object_t obj;
 752
 753                         if (flags & UMA_SLAB_KMEM)
 754                                 obj = kmem_object;
 755                         else if (flags & UMA_SLAB_KERNEL)
 756                                 obj = kernel_object;
 757                         else
 758                                 obj = NULL;
 759                         for (i = 0; i < keg->uk_ppera; i++)
 760                                 vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
 761                                     obj);
 762                 }
 763                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 764                         zone_free_item(keg->uk_slabzone, slab, NULL,
 765                             SKIP_NONE, ZFREE_STATFREE);
 766 #ifdef UMA_DEBUG
 767                 printf("%s: Returning %d bytes.\n",
 768                     keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
 769 #endif
 770                 keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
 771         }
 772 }
 773
 774 static void
 775 zone_drain_wait(uma_zone_t zone, int waitok)
 776 {
 777
 778         /*
 779          * Set draining to interlock with zone_dtor() so we can release our
 780          * locks as we go.  Only dtor() should do a WAITOK call since it
 781          * is the only call that knows the structure will still be available
 782          * when it wakes up.
 783          */
 784         ZONE_LOCK(zone);
 785         while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 786                 if (waitok == M_NOWAIT)
 787                         goto out;
 788                 mtx_unlock(&uma_mtx);
 789                 msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
 790                 mtx_lock(&uma_mtx);
 791         }
 792         zone->uz_flags |= UMA_ZFLAG_DRAINING;
 793         bucket_cache_drain(zone);
 794         ZONE_UNLOCK(zone);
 795         /*
 796          * The DRAINING flag protects us from being freed while
 797          * we're running.  Normally the uma_mtx would protect us but we
 798          * must be able to release and acquire the right lock for each keg.
 799          */
 800         zone_foreach_keg(zone, &keg_drain);
 801         ZONE_LOCK(zone);
 802         zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 803         wakeup(zone);
 804 out:
 805         ZONE_UNLOCK(zone);
 806 }
 807
 808 void
 809 zone_drain(uma_zone_t zone)
 810 {
 811
 812         zone_drain_wait(zone, M_NOWAIT);
 813 }
 814
/*
 * Allocate a new slab for a keg.  This does not insert the slab onto a list.
 *
 * Arguments:
 *      keg   The keg to allocate the slab for.  Its lock must be held on
 *            entry; it is dropped while memory is obtained and items are
 *            initialized, and is held again on every return path.
 *      zone  Passed through unchanged to the keg's uk_allocf routine.
 *      wait  Allocation flags.  M_ZERO is forced on unless the keg is
 *            UMA_ZONE_MALLOC (where it is cleared), and M_NODUMP is added
 *            for UMA_ZONE_NODUMP kegs.
 *
 * Returns:
 *      The slab that was allocated, already counted in uk_pages/uk_free and
 *      entered in the keg hash for UMA_ZONE_HASH kegs, or NULL if the slab
 *      header or backing memory could not be allocated or an item's
 *      uk_init routine failed.
 */
static uma_slab_t
keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
{
        uma_slabrefcnt_t slabref;
        uma_alloc allocf;
        uma_slab_t slab;
        u_int8_t *mem;
        u_int8_t flags;
        int i;

        mtx_assert(&keg->uk_lock, MA_OWNED);
        slab = NULL;

#ifdef UMA_DEBUG
        printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
#endif
        /* Sample the backend allocator while the keg lock is still held. */
        allocf = keg->uk_allocf;
        KEG_UNLOCK(keg);

        /* Off-page kegs keep the slab header in a separate zone. */
        if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
                slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
                if (slab == NULL) {
                        KEG_LOCK(keg);
                        return NULL;
                }
        }

        /*
         * This reproduces the old vm_zone behavior of zero filling pages the
         * first time they are added to a zone.
         *
         * Malloced items are zeroed in uma_zalloc.
         */

        if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
                wait |= M_ZERO;
        else
                wait &= ~M_ZERO;

        if (keg->uk_flags & UMA_ZONE_NODUMP)
                wait |= M_NODUMP;

        /* zone is passed for legacy reasons. */
        mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
        if (mem == NULL) {
                /* Give back the off-page header allocated above. */
                if (keg->uk_flags & UMA_ZONE_OFFPAGE)
                        zone_free_item(keg->uk_slabzone, slab, NULL,
                            SKIP_NONE, ZFREE_STATFREE);
                KEG_LOCK(keg);
                return (NULL);
        }

        /* Point the slab into the allocated memory */
        if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
                slab = (uma_slab_t )(mem + keg->uk_pgoff);

        /* Record the slab against every page so it can be found by address. */
        if (keg->uk_flags & UMA_ZONE_VTOSLAB)
                for (i = 0; i < keg->uk_ppera; i++)
                        vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);

        slab->us_keg = keg;
        slab->us_data = mem;
        slab->us_freecount = keg->uk_ipers;
        slab->us_firstfree = 0;
        slab->us_flags = flags;

        /*
         * Build the embedded free list: entry i links to item i + 1.
         * Refcounted kegs use the larger slab layout and also zero each
         * item's reference count.
         */
        if (keg->uk_flags & UMA_ZONE_REFCNT) {
                slabref = (uma_slabrefcnt_t)slab;
                for (i = 0; i < keg->uk_ipers; i++) {
                        slabref->us_freelist[i].us_refcnt = 0;
                        slabref->us_freelist[i].us_item = i+1;
                }
        } else {
                for (i = 0; i < keg->uk_ipers; i++)
                        slab->us_freelist[i].us_item = i+1;
        }

        if (keg->uk_init != NULL) {
                /* Initialize items in order, stopping at the first failure. */
                for (i = 0; i < keg->uk_ipers; i++)
                        if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
                            keg->uk_size, wait) != 0)
                                break;
                if (i != keg->uk_ipers) {
                        /*
                         * Item i failed to initialize.  Undo the items that
                         * did succeed (i - 1 down to 0), restore the pages'
                         * object pointers, and release the slab.
                         */
                        if (keg->uk_fini != NULL) {
                                for (i--; i > -1; i--)
                                        keg->uk_fini(slab->us_data +
                                            (keg->uk_rsize * i),
                                            keg->uk_size);
                        }
                        if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
                                vm_object_t obj;

                                if (flags & UMA_SLAB_KMEM)
                                        obj = kmem_object;
                                else if (flags & UMA_SLAB_KERNEL)
                                        obj = kernel_object;
                                else
                                        obj = NULL;
                                for (i = 0; i < keg->uk_ppera; i++)
                                        vsetobj((vm_offset_t)mem +
                                            (i * PAGE_SIZE), obj);
                        }
                        if (keg->uk_flags & UMA_ZONE_OFFPAGE)
                                zone_free_item(keg->uk_slabzone, slab,
                                    NULL, SKIP_NONE, ZFREE_STATFREE);
                        keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
                            flags);
                        KEG_LOCK(keg);
                        return (NULL);
                }
        }
        KEG_LOCK(keg);

        /* Publish the slab: hash entry and keg accounting, under the lock. */
        if (keg->uk_flags & UMA_ZONE_HASH)
                UMA_HASH_INSERT(&keg->uk_hash, slab, mem);

        keg->uk_pages += keg->uk_ppera;
        keg->uk_free += keg->uk_ipers;

        return (slab);
}
 946
 947 /*
 948  * This function is intended to be used early on in place of page_alloc() so
 949  * that we may use the boot time page cache to satisfy allocations before
 950  * the VM is ready.
 951  */
 952 static void *
 953 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 954 {
 955         uma_keg_t keg;
 956         uma_slab_t tmps;
 957         int pages, check_pages;
 958
 959         keg = zone_first_keg(zone);
 960         pages = howmany(bytes, PAGE_SIZE);
 961         check_pages = pages - 1;
 962         KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
 963
 964         /*
 965          * Check our small startup cache to see if it has pages remaining.
 966          */
 967         mtx_lock(&uma_boot_pages_mtx);
 968
 969         /* First check if we have enough room. */
 970         tmps = LIST_FIRST(&uma_boot_pages);
 971         while (tmps != NULL && check_pages-- > 0)
 972                 tmps = LIST_NEXT(tmps, us_link);
 973         if (tmps != NULL) {
 974                 /*
 975                  * It's ok to lose tmps references.  The last one will
 976                  * have tmps->us_data pointing to the start address of
 977                  * "pages" contiguous pages of memory.
 978                  */
 979                 while (pages-- > 0) {
 980                         tmps = LIST_FIRST(&uma_boot_pages);
 981                         LIST_REMOVE(tmps, us_link);
 982                 }
 983                 mtx_unlock(&uma_boot_pages_mtx);
 984                 *pflag = tmps->us_flags;
 985                 return (tmps->us_data);
 986         }
 987         mtx_unlock(&uma_boot_pages_mtx);
 988         if (booted < UMA_STARTUP2)
 989                 panic("UMA: Increase vm.boot_pages");
 990         /*
 991          * Now that we've booted reset these users to their real allocator.
 992          */
 993 #ifdef UMA_MD_SMALL_ALLOC
 994         keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
 995 #else
 996         keg->uk_allocf = page_alloc;
 997 #endif
 998         return keg->uk_allocf(zone, bytes, pflag, wait);
 999 }
1000
1001 /*
1002  * Allocates a number of pages from the system
1003  *
1004  * Arguments:
1005  *      bytes  The number of bytes requested
1006  *      wait  Shall we wait?
1007  *
1008  * Returns:
1009  *      A pointer to the alloced memory or possibly
1010  *      NULL if M_NOWAIT is set.
1011  */
1012 static void *
1013 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
1014 {
1015         void *p;        /* Returned page */
1016
1017         *pflag = UMA_SLAB_KMEM;
1018         p = (void *) kmem_malloc(kmem_map, bytes, wait);
1019
1020         return (p);
1021 }
1022
1023 /*
1024  * Allocates a number of pages from within an object
1025  *
1026  * Arguments:
1027  *      bytes  The number of bytes requested
1028  *      wait   Shall we wait?
1029  *
1030  * Returns:
1031  *      A pointer to the alloced memory or possibly
1032  *      NULL if M_NOWAIT is set.
1033  */
1034 static void *
1035 noobj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
1036 {
1037         TAILQ_HEAD(, vm_page) alloctail;
1038         u_long npages;
1039         vm_offset_t retkva, zkva;
1040         vm_page_t p, p_next;
1041         uma_keg_t keg;
1042
1043         TAILQ_INIT(&alloctail);
1044         keg = zone_first_keg(zone);
1045
1046         npages = howmany(bytes, PAGE_SIZE);
1047         while (npages > 0) {
1048                 p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1049                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1050                 if (p != NULL) {
1051                         /*
1052                          * Since the page does not belong to an object, its
1053                          * listq is unused.
1054                          */
1055                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1056                         npages--;
1057                         continue;
1058                 }
1059                 if (wait & M_WAITOK) {
1060                         VM_WAIT;
1061                         continue;
1062                 }
1063
1064                 /*
1065                  * Page allocation failed, free intermediate pages and
1066                  * exit.
1067                  */
1068                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1069                         vm_page_unwire(p, 0);
1070                         vm_page_free(p);
1071                 }
1072                 return (NULL);
1073         }
1074         *flags = UMA_SLAB_PRIV;
1075         zkva = keg->uk_kva +
1076             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1077         retkva = zkva;
1078         TAILQ_FOREACH(p, &alloctail, listq) {
1079                 pmap_qenter(zkva, &p, 1);
1080                 zkva += PAGE_SIZE;
1081         }
1082
1083         return ((void *)retkva);
1084 }
1085
1086 /*
1087  * Frees a number of pages to the system
1088  *
1089  * Arguments:
1090  *      mem   A pointer to the memory to be freed
1091  *      size  The size of the memory being freed
1092  *      flags The original p->us_flags field
1093  *
1094  * Returns:
1095  *      Nothing
1096  */
1097 static void
1098 page_free(void *mem, int size, u_int8_t flags)
1099 {
1100         vm_map_t map;
1101
1102         if (flags & UMA_SLAB_KMEM)
1103                 map = kmem_map;
1104         else if (flags & UMA_SLAB_KERNEL)
1105                 map = kernel_map;
1106         else
1107                 panic("UMA: page_free used with invalid flags %d", flags);
1108
1109         kmem_free(map, (vm_offset_t)mem, size);
1110 }
1111
/*
 * Zero fill initializer: clears the first 'size' bytes of the item.
 *
 * Arguments/Returns follow uma_init specifications
 *      flags  Not consulted
 *
 * Always succeeds and returns 0.
 */
static int
zero_init(void *mem, int size, int flags)
{

        bzero(mem, size);

        return (0);
}
1123
1124 /*
1125  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1126  *
1127  * Arguments
1128  *      keg  The zone we should initialize
1129  *
1130  * Returns
1131  *      Nothing
1132  */
1133 static void
1134 keg_small_init(uma_keg_t keg)
1135 {
1136         u_int rsize;
1137         u_int memused;
1138         u_int wastedspace;
1139         u_int shsize;
1140
1141         KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
1142         rsize = keg->uk_size;
1143
1144         if (rsize < UMA_SMALLEST_UNIT)
1145                 rsize = UMA_SMALLEST_UNIT;
1146         if (rsize & keg->uk_align)
1147                 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1148
1149         keg->uk_rsize = rsize;
1150         keg->uk_ppera = 1;
1151
1152         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1153                 shsize = 0;
1154         } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1155                 rsize += UMA_FRITMREF_SZ;       /* linkage & refcnt */
1156                 shsize = sizeof(struct uma_slab_refcnt);
1157         } else {
1158                 rsize += UMA_FRITM_SZ;  /* Account for linkage */
1159                 shsize = sizeof(struct uma_slab);
1160         }
1161
1162         keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
1163         KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
1164         memused = keg->uk_ipers * rsize + shsize;
1165         wastedspace = UMA_SLAB_SIZE - memused;
1166
1167         /*
1168          * We can't do OFFPAGE if we're internal or if we've been
1169          * asked to not go to the VM for buckets.  If we do this we
1170          * may end up going to the VM (kmem_map) for slabs which we
1171          * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
1172          * result of UMA_ZONE_VM, which clearly forbids it.
1173          */
1174         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1175             (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1176                 return;
1177
1178         if ((wastedspace >= UMA_MAX_WASTE) &&
1179             (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
1180                 keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
1181                 KASSERT(keg->uk_ipers <= 255,
1182                     ("keg_small_init: keg->uk_ipers too high!"));
1183 #ifdef UMA_DEBUG
1184                 printf("UMA decided we need offpage slab headers for "
1185                     "keg: %s, calculated wastedspace = %d, "
1186                     "maximum wasted space allowed = %d, "
1187                     "calculated ipers = %d, "
1188                     "new wasted space = %d\n", keg->uk_name, wastedspace,
1189                     UMA_MAX_WASTE, keg->uk_ipers,
1190                     UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
1191 #endif
1192                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1193                 if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1194                         keg->uk_flags |= UMA_ZONE_HASH;
1195         }
1196 }
1197
1198 /*
1199  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1200  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1201  * more complicated.
1202  *
1203  * Arguments
1204  *      keg  The keg we should initialize
1205  *
1206  * Returns
1207  *      Nothing
1208  */
1209 static void
1210 keg_large_init(uma_keg_t keg)
1211 {
1212         int pages;
1213
1214         KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1215         KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1216             ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1217
1218         pages = keg->uk_size / UMA_SLAB_SIZE;
1219
1220         /* Account for remainder */
1221         if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
1222                 pages++;
1223
1224         keg->uk_ppera = pages;
1225         keg->uk_ipers = 1;
1226         keg->uk_rsize = keg->uk_size;
1227
1228         /* We can't do OFFPAGE if we're internal, bail out here. */
1229         if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1230                 return;
1231
1232         keg->uk_flags |= UMA_ZONE_OFFPAGE;
1233         if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1234                 keg->uk_flags |= UMA_ZONE_HASH;
1235 }
1236
1237 static void
1238 keg_cachespread_init(uma_keg_t keg)
1239 {
1240         int alignsize;
1241         int trailer;
1242         int pages;
1243         int rsize;
1244
1245         alignsize = keg->uk_align + 1;
1246         rsize = keg->uk_size;
1247         /*
1248          * We want one item to start on every align boundary in a page.  To
1249          * do this we will span pages.  We will also extend the item by the
1250          * size of align if it is an even multiple of align.  Otherwise, it
1251          * would fall on the same boundary every time.
1252          */
1253         if (rsize & keg->uk_align)
1254                 rsize = (rsize & ~keg->uk_align) + alignsize;
1255         if ((rsize & alignsize) == 0)
1256                 rsize += alignsize;
1257         trailer = rsize - keg->uk_size;
1258         pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1259         pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1260         keg->uk_rsize = rsize;
1261         keg->uk_ppera = pages;
1262         keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1263         keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1264         KASSERT(keg->uk_ipers <= uma_max_ipers,
1265             ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1266             keg->uk_ipers));
1267 }
1268
1269 /*
1270  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1271  * the keg onto the global keg list.
1272  *
1273  * Arguments/Returns follow uma_ctor specifications
1274  *      udata  Actually uma_kctor_args
1275  */
1276 static int
1277 keg_ctor(void *mem, int size, void *udata, int flags)
1278 {
1279         struct uma_kctor_args *arg = udata;
1280         uma_keg_t keg = mem;
1281         uma_zone_t zone;
1282
1283         bzero(keg, size);
1284         keg->uk_size = arg->size;
1285         keg->uk_init = arg->uminit;
1286         keg->uk_fini = arg->fini;
1287         keg->uk_align = arg->align;
1288         keg->uk_free = 0;
1289         keg->uk_pages = 0;
1290         keg->uk_flags = arg->flags;
1291         keg->uk_allocf = page_alloc;
1292         keg->uk_freef = page_free;
1293         keg->uk_recurse = 0;
1294         keg->uk_slabzone = NULL;
1295
1296         /*
1297          * The master zone is passed to us at keg-creation time.
1298          */
1299         zone = arg->zone;
1300         keg->uk_name = zone->uz_name;
1301
1302         if (arg->flags & UMA_ZONE_VM)
1303                 keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1304
1305         if (arg->flags & UMA_ZONE_ZINIT)
1306                 keg->uk_init = zero_init;
1307
1308         if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1309                 keg->uk_flags |= UMA_ZONE_VTOSLAB;
1310
1311         /*
1312          * The +UMA_FRITM_SZ added to uk_size is to account for the
1313          * linkage that is added to the size in keg_small_init().  If
1314          * we don't account for this here then we may end up in
1315          * keg_small_init() with a calculated 'ipers' of 0.
1316          */
1317         if (keg->uk_flags & UMA_ZONE_REFCNT) {
1318                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1319                         keg_cachespread_init(keg);
1320                 else if ((keg->uk_size+UMA_FRITMREF_SZ) >
1321                     (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
1322                         keg_large_init(keg);
1323                 else
1324                         keg_small_init(keg);
1325         } else {
1326                 if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1327                         keg_cachespread_init(keg);
1328                 else if ((keg->uk_size+UMA_FRITM_SZ) >
1329                     (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1330                         keg_large_init(keg);
1331                 else
1332                         keg_small_init(keg);
1333         }
1334
1335         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1336                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1337                         keg->uk_slabzone = slabrefzone;
1338                 else
1339                         keg->uk_slabzone = slabzone;
1340         }
1341
1342         /*
1343          * If we haven't booted yet we need allocations to go through the
1344          * startup cache until the vm is ready.
1345          */
1346         if (keg->uk_ppera == 1) {
1347 #ifdef UMA_MD_SMALL_ALLOC
1348                 keg->uk_allocf = uma_small_alloc;
1349                 keg->uk_freef = uma_small_free;
1350
1351                 if (booted < UMA_STARTUP)
1352                         keg->uk_allocf = startup_alloc;
1353 #else
1354                 if (booted < UMA_STARTUP2)
1355                         keg->uk_allocf = startup_alloc;
1356 #endif
1357         } else if (booted < UMA_STARTUP2 &&
1358             (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1359                 keg->uk_allocf = startup_alloc;
1360
1361         /*
1362          * Initialize keg's lock (shared among zones).
1363          */
1364         if (arg->flags & UMA_ZONE_MTXCLASS)
1365                 KEG_LOCK_INIT(keg, 1);
1366         else
1367                 KEG_LOCK_INIT(keg, 0);
1368
1369         /*
1370          * If we're putting the slab header in the actual page we need to
1371          * figure out where in each page it goes.  This calculates a right
1372          * justified offset into the memory on an ALIGN_PTR boundary.
1373          */
1374         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1375                 u_int totsize;
1376
1377                 /* Size of the slab struct and free list */
1378                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1379                         totsize = sizeof(struct uma_slab_refcnt) +
1380                             keg->uk_ipers * UMA_FRITMREF_SZ;
1381                 else
1382                         totsize = sizeof(struct uma_slab) +
1383                             keg->uk_ipers * UMA_FRITM_SZ;
1384
1385                 if (totsize & UMA_ALIGN_PTR)
1386                         totsize = (totsize & ~UMA_ALIGN_PTR) +
1387                             (UMA_ALIGN_PTR + 1);
1388                 keg->uk_pgoff = (UMA_SLAB_SIZE * keg->uk_ppera) - totsize;
1389
1390                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1391                         totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
1392                             + keg->uk_ipers * UMA_FRITMREF_SZ;
1393                 else
1394                         totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1395                             + keg->uk_ipers * UMA_FRITM_SZ;
1396
1397                 /*
1398                  * The only way the following is possible is if with our
1399                  * UMA_ALIGN_PTR adjustments we are now bigger than
1400                  * UMA_SLAB_SIZE.  I haven't checked whether this is
1401                  * mathematically possible for all cases, so we make
1402                  * sure here anyway.
1403                  */
1404                 if (totsize > UMA_SLAB_SIZE * keg->uk_ppera) {
1405                         printf("zone %s ipers %d rsize %d size %d\n",
1406                             zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1407                             keg->uk_size);
1408                         panic("UMA slab won't fit.");
1409                 }
1410         }
1411
1412         if (keg->uk_flags & UMA_ZONE_HASH)
1413                 hash_alloc(&keg->uk_hash);
1414
1415 #ifdef UMA_DEBUG
1416         printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1417             zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1418             keg->uk_ipers, keg->uk_ppera,
1419             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1420 #endif
1421
1422         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1423
1424         mtx_lock(&uma_mtx);
1425         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1426         mtx_unlock(&uma_mtx);
1427         return (0);
1428 }
1429
1430 /*
1431  * Zone header ctor.  This initializes all fields, locks, etc.
1432  *
1433  * Arguments/Returns follow uma_ctor specifications
1434  *      udata  Actually uma_zctor_args
1435  */
1436 static int
1437 zone_ctor(void *mem, int size, void *udata, int flags)
1438 {
1439         struct uma_zctor_args *arg = udata;
1440         uma_zone_t zone = mem;
1441         uma_zone_t z;
1442         uma_keg_t keg;
1443
1444         bzero(zone, size);
1445         zone->uz_name = arg->name;
1446         zone->uz_ctor = arg->ctor;
1447         zone->uz_dtor = arg->dtor;
1448         zone->uz_slab = zone_fetch_slab;
1449         zone->uz_init = NULL;
1450         zone->uz_fini = NULL;
1451         zone->uz_allocs = 0;
1452         zone->uz_frees = 0;
1453         zone->uz_fails = 0;
1454         zone->uz_sleeps = 0;
1455         zone->uz_fills = zone->uz_count = 0;
1456         zone->uz_flags = 0;
1457         zone->uz_warning = NULL;
1458         timevalclear(&zone->uz_ratecheck);
1459         keg = arg->keg;
1460
1461         if (arg->flags & UMA_ZONE_SECONDARY) {
1462                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1463                 zone->uz_init = arg->uminit;
1464                 zone->uz_fini = arg->fini;
1465                 zone->uz_lock = &keg->uk_lock;
1466                 zone->uz_flags |= UMA_ZONE_SECONDARY;
1467                 mtx_lock(&uma_mtx);
1468                 ZONE_LOCK(zone);
1469                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1470                         if (LIST_NEXT(z, uz_link) == NULL) {
1471                                 LIST_INSERT_AFTER(z, zone, uz_link);
1472                                 break;
1473                         }
1474                 }
1475                 ZONE_UNLOCK(zone);
1476                 mtx_unlock(&uma_mtx);
1477         } else if (keg == NULL) {
1478                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1479                     arg->align, arg->flags)) == NULL)
1480                         return (ENOMEM);
1481         } else {
1482                 struct uma_kctor_args karg;
1483                 int error;
1484
1485                 /* We should only be here from uma_startup() */
1486                 karg.size = arg->size;
1487                 karg.uminit = arg->uminit;
1488                 karg.fini = arg->fini;
1489                 karg.align = arg->align;
1490                 karg.flags = arg->flags;
1491                 karg.zone = zone;
1492                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1493                     flags);
1494                 if (error)
1495                         return (error);
1496         }
1497         /*
1498          * Link in the first keg.
1499          */
1500         zone->uz_klink.kl_keg = keg;
1501         LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1502         zone->uz_lock = &keg->uk_lock;
1503         zone->uz_size = keg->uk_size;
1504         zone->uz_flags |= (keg->uk_flags &
1505             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1506
1507         /*
1508          * Some internal zones don't have room allocated for the per cpu
1509          * caches.  If we're internal, bail out here.
1510          */
1511         if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1512                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1513                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1514                 return (0);
1515         }
1516
1517         if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1518                 zone->uz_count = BUCKET_MAX;
1519         else if (keg->uk_ipers <= BUCKET_MAX)
1520                 zone->uz_count = keg->uk_ipers;
1521         else
1522                 zone->uz_count = BUCKET_MAX;
1523         return (0);
1524 }
1525
1526 /*
1527  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1528  * table and removes the keg from the global list.
1529  *
1530  * Arguments/Returns follow uma_dtor specifications
1531  *      udata  unused
1532  */
1533 static void
1534 keg_dtor(void *arg, int size, void *udata)
1535 {
1536         uma_keg_t keg;
1537
1538         keg = (uma_keg_t)arg;
1539         KEG_LOCK(keg);
1540         if (keg->uk_free != 0) {
1541                 printf("Freed UMA keg was not empty (%d items). "
1542                     " Lost %d pages of memory.\n",
1543                     keg->uk_free, keg->uk_pages);
1544         }
1545         KEG_UNLOCK(keg);
1546
1547         hash_free(&keg->uk_hash);
1548
1549         KEG_LOCK_FINI(keg);
1550 }
1551
1552 /*
1553  * Zone header dtor.
1554  *
1555  * Arguments/Returns follow uma_dtor specifications
1556  *      udata  unused
1557  */
1558 static void
1559 zone_dtor(void *arg, int size, void *udata)
1560 {
1561         uma_klink_t klink;
1562         uma_zone_t zone;
1563         uma_keg_t keg;
1564
1565         zone = (uma_zone_t)arg;
1566         keg = zone_first_keg(zone);
1567
1568         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1569                 cache_drain(zone);
1570
1571         mtx_lock(&uma_mtx);
1572         LIST_REMOVE(zone, uz_link);
1573         mtx_unlock(&uma_mtx);
1574         /*
1575          * XXX there are some races here where
1576          * the zone can be drained but zone lock
1577          * released and then refilled before we
1578          * remove it... we dont care for now
1579          */
1580         zone_drain_wait(zone, M_WAITOK);
1581         /*
1582          * Unlink all of our kegs.
1583          */
1584         while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1585                 klink->kl_keg = NULL;
1586                 LIST_REMOVE(klink, kl_link);
1587                 if (klink == &zone->uz_klink)
1588                         continue;
1589                 free(klink, M_TEMP);
1590         }
1591         /*
1592          * We only destroy kegs from non secondary zones.
1593          */
1594         if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1595                 mtx_lock(&uma_mtx);
1596                 LIST_REMOVE(keg, uk_link);
1597                 mtx_unlock(&uma_mtx);
1598                 zone_free_item(kegs, keg, NULL, SKIP_NONE,
1599                     ZFREE_STATFREE);
1600         }
1601 }
1602
1603 /*
1604  * Traverses every zone in the system and calls a callback
1605  *
1606  * Arguments:
1607  *      zfunc  A pointer to a function which accepts a zone
1608  *              as an argument.
1609  *
1610  * Returns:
1611  *      Nothing
1612  */
1613 static void
1614 zone_foreach(void (*zfunc)(uma_zone_t))
1615 {
1616         uma_keg_t keg;
1617         uma_zone_t zone;
1618
1619         mtx_lock(&uma_mtx);
1620         LIST_FOREACH(keg, &uma_kegs, uk_link) {
1621                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1622                         zfunc(zone);
1623         }
1624         mtx_unlock(&uma_mtx);
1625 }
1626
1627 /* Public functions */
1628 /* See uma.h */
1629 void
1630 uma_startup(void *bootmem, int boot_pages)
1631 {
1632         struct uma_zctor_args args;
1633         uma_slab_t slab;
1634         u_int slabsize;
1635         u_int objsize, totsize, wsize;
1636         int i;
1637
1638 #ifdef UMA_DEBUG
1639         printf("Creating uma keg headers zone and keg.\n");
1640 #endif
1641         mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1642
1643         /*
1644          * Figure out the maximum number of items-per-slab we'll have if
1645          * we're using the OFFPAGE slab header to track free items, given
1646          * all possible object sizes and the maximum desired wastage
1647          * (UMA_MAX_WASTE).
1648          *
1649          * We iterate until we find an object size for
1650          * which the calculated wastage in keg_small_init() will be
1651          * enough to warrant OFFPAGE.  Since wastedspace versus objsize
1652          * is an overall increasing see-saw function, we find the smallest
1653          * objsize such that the wastage is always acceptable for objects
1654          * with that objsize or smaller.  Since a smaller objsize always
1655          * generates a larger possible uma_max_ipers, we use this computed
1656          * objsize to calculate the largest ipers possible.  Since the
1657          * ipers calculated for OFFPAGE slab headers is always larger than
1658          * the ipers initially calculated in keg_small_init(), we use
1659          * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
1660          * obtain the maximum ipers possible for offpage slab headers.
1661          *
1662          * It should be noted that ipers versus objsize is an inversly
1663          * proportional function which drops off rather quickly so as
1664          * long as our UMA_MAX_WASTE is such that the objsize we calculate
1665          * falls into the portion of the inverse relation AFTER the steep
1666          * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
1667          *
1668          * Note that we have 8-bits (1 byte) to use as a freelist index
1669          * inside the actual slab header itself and this is enough to
1670          * accomodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
1671          * object with offpage slab header would have ipers =
1672          * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
1673          * 1 greater than what our byte-integer freelist index can
1674          * accomodate, but we know that this situation never occurs as
1675          * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
1676          * that we need to go to offpage slab headers.  Or, if we do,
1677          * then we trap that condition below and panic in the INVARIANTS case.
1678          */
1679         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
1680         totsize = wsize;
1681         objsize = UMA_SMALLEST_UNIT;
1682         while (totsize >= wsize) {
1683                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
1684                     (objsize + UMA_FRITM_SZ);
1685                 totsize *= (UMA_FRITM_SZ + objsize);
1686                 objsize++;
1687         }
1688         if (objsize > UMA_SMALLEST_UNIT)
1689                 objsize--;
1690         uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
1691
1692         wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
1693         totsize = wsize;
1694         objsize = UMA_SMALLEST_UNIT;
1695         while (totsize >= wsize) {
1696                 totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
1697                     (objsize + UMA_FRITMREF_SZ);
1698                 totsize *= (UMA_FRITMREF_SZ + objsize);
1699                 objsize++;
1700         }
1701         if (objsize > UMA_SMALLEST_UNIT)
1702                 objsize--;
1703         uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
1704
1705         KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
1706             ("uma_startup: calculated uma_max_ipers values too large!"));
1707
1708 #ifdef UMA_DEBUG
1709         printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
1710         printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
1711             uma_max_ipers_ref);
1712 #endif
1713
1714         /* "manually" create the initial zone */
1715         args.name = "UMA Kegs";
1716         args.size = sizeof(struct uma_keg);
1717         args.ctor = keg_ctor;
1718         args.dtor = keg_dtor;
1719         args.uminit = zero_init;
1720         args.fini = NULL;
1721         args.keg = &masterkeg;
1722         args.align = 32 - 1;
1723         args.flags = UMA_ZFLAG_INTERNAL;
1724         /* The initial zone has no Per cpu queues so it's smaller */
1725         zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1726
1727 #ifdef UMA_DEBUG
1728         printf("Filling boot free list.\n");
1729 #endif
1730         for (i = 0; i < boot_pages; i++) {
1731                 slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1732                 slab->us_data = (u_int8_t *)slab;
1733                 slab->us_flags = UMA_SLAB_BOOT;
1734                 LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1735         }
1736         mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1737
1738 #ifdef UMA_DEBUG
1739         printf("Creating uma zone headers zone and keg.\n");
1740 #endif
1741         args.name = "UMA Zones";
1742         args.size = sizeof(struct uma_zone) +
1743             (sizeof(struct uma_cache) * (mp_maxid + 1));
1744         args.ctor = zone_ctor;
1745         args.dtor = zone_dtor;
1746         args.uminit = zero_init;
1747         args.fini = NULL;
1748         args.keg = NULL;
1749         args.align = 32 - 1;
1750         args.flags = UMA_ZFLAG_INTERNAL;
1751         /* The initial zone has no Per cpu queues so it's smaller */
1752         zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1753
1754 #ifdef UMA_DEBUG
1755         printf("Initializing pcpu cache locks.\n");
1756 #endif
1757 #ifdef UMA_DEBUG
1758         printf("Creating slab and hash zones.\n");
1759 #endif
1760
1761         /*
1762          * This is the max number of free list items we'll have with
1763          * offpage slabs.
1764          */
1765         slabsize = uma_max_ipers * UMA_FRITM_SZ;
1766         slabsize += sizeof(struct uma_slab);
1767
1768         /* Now make a zone for slab headers */
1769         slabzone = uma_zcreate("UMA Slabs",
1770                                 slabsize,
1771                                 NULL, NULL, NULL, NULL,
1772                                 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1773
1774         /*
1775          * We also create a zone for the bigger slabs with reference
1776          * counts in them, to accomodate UMA_ZONE_REFCNT zones.
1777          */
1778         slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
1779         slabsize += sizeof(struct uma_slab_refcnt);
1780         slabrefzone = uma_zcreate("UMA RCntSlabs",
1781                                   slabsize,
1782                                   NULL, NULL, NULL, NULL,
1783                                   UMA_ALIGN_PTR,
1784                                   UMA_ZFLAG_INTERNAL);
1785
1786         hashzone = uma_zcreate("UMA Hash",
1787             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1788             NULL, NULL, NULL, NULL,
1789             UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1790
1791         bucket_init();
1792
1793         booted = UMA_STARTUP;
1794
1795 #ifdef UMA_DEBUG
1796         printf("UMA startup complete.\n");
1797 #endif
1798 }
1799
1800 /* see uma.h */
1801 void
1802 uma_startup2(void)
1803 {
1804         booted = UMA_STARTUP2;
1805         bucket_enable();
1806 #ifdef UMA_DEBUG
1807         printf("UMA startup2 complete.\n");
1808 #endif
1809 }
1810
1811 /*
1812  * Initialize our callout handle
1813  *
1814  */
1815
1816 static void
1817 uma_startup3(void)
1818 {
1819 #ifdef UMA_DEBUG
1820         printf("Starting callout.\n");
1821 #endif
1822         callout_init(&uma_callout, CALLOUT_MPSAFE);
1823         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1824 #ifdef UMA_DEBUG
1825         printf("UMA startup3 complete.\n");
1826 #endif
1827 }
1828
1829 static uma_keg_t
1830 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1831                 int align, u_int32_t flags)
1832 {
1833         struct uma_kctor_args args;
1834
1835         args.size = size;
1836         args.uminit = uminit;
1837         args.fini = fini;
1838         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1839         args.flags = flags;
1840         args.zone = zone;
1841         return (zone_alloc_item(kegs, &args, M_WAITOK));
1842 }
1843
1844 /* See uma.h */
1845 void
1846 uma_set_align(int align)
1847 {
1848
1849         if (align != UMA_ALIGN_CACHE)
1850                 uma_align_cache = align;
1851 }
1852
1853 /* See uma.h */
1854 uma_zone_t
1855 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1856                 uma_init uminit, uma_fini fini, int align, u_int32_t flags)
1857
1858 {
1859         struct uma_zctor_args args;
1860
1861         /* This stuff is essential for the zone ctor */
1862         args.name = name;
1863         args.size = size;
1864         args.ctor = ctor;
1865         args.dtor = dtor;
1866         args.uminit = uminit;
1867         args.fini = fini;
1868         args.align = align;
1869         args.flags = flags;
1870         args.keg = NULL;
1871
1872         return (zone_alloc_item(zones, &args, M_WAITOK));
1873 }
1874
1875 /* See uma.h */
1876 uma_zone_t
1877 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1878                     uma_init zinit, uma_fini zfini, uma_zone_t master)
1879 {
1880         struct uma_zctor_args args;
1881         uma_keg_t keg;
1882
1883         keg = zone_first_keg(master);
1884         args.name = name;
1885         args.size = keg->uk_size;
1886         args.ctor = ctor;
1887         args.dtor = dtor;
1888         args.uminit = zinit;
1889         args.fini = zfini;
1890         args.align = keg->uk_align;
1891         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1892         args.keg = keg;
1893
1894         /* XXX Attaches only one keg of potentially many. */
1895         return (zone_alloc_item(zones, &args, M_WAITOK));
1896 }
1897
1898 static void
1899 zone_lock_pair(uma_zone_t a, uma_zone_t b)
1900 {
1901         if (a < b) {
1902                 ZONE_LOCK(a);
1903                 mtx_lock_flags(b->uz_lock, MTX_DUPOK);
1904         } else {
1905                 ZONE_LOCK(b);
1906                 mtx_lock_flags(a->uz_lock, MTX_DUPOK);
1907         }
1908 }
1909
1910 static void
1911 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
1912 {
1913
1914         ZONE_UNLOCK(a);
1915         ZONE_UNLOCK(b);
1916 }
1917
1918 int
1919 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
1920 {
1921         uma_klink_t klink;
1922         uma_klink_t kl;
1923         int error;
1924
1925         error = 0;
1926         klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
1927
1928         zone_lock_pair(zone, master);
1929         /*
1930          * zone must use vtoslab() to resolve objects and must already be
1931          * a secondary.
1932          */
1933         if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
1934             != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
1935                 error = EINVAL;
1936                 goto out;
1937         }
1938         /*
1939          * The new master must also use vtoslab().
1940          */
1941         if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
1942                 error = EINVAL;
1943                 goto out;
1944         }
1945         /*
1946          * Both must either be refcnt, or not be refcnt.
1947          */
1948         if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
1949             (master->uz_flags & UMA_ZONE_REFCNT)) {
1950                 error = EINVAL;
1951                 goto out;
1952         }
1953         /*
1954          * The underlying object must be the same size.  rsize
1955          * may be different.
1956          */
1957         if (master->uz_size != zone->uz_size) {
1958                 error = E2BIG;
1959                 goto out;
1960         }
1961         /*
1962          * Put it at the end of the list.
1963          */
1964         klink->kl_keg = zone_first_keg(master);
1965         LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
1966                 if (LIST_NEXT(kl, kl_link) == NULL) {
1967                         LIST_INSERT_AFTER(kl, klink, kl_link);
1968                         break;
1969                 }
1970         }
1971         klink = NULL;
1972         zone->uz_flags |= UMA_ZFLAG_MULTI;
1973         zone->uz_slab = zone_fetch_slab_multi;
1974
1975 out:
1976         zone_unlock_pair(zone, master);
1977         if (klink != NULL)
1978                 free(klink, M_TEMP);
1979
1980         return (error);
1981 }
1982
1983
1984 /* See uma.h */
1985 void
1986 uma_zdestroy(uma_zone_t zone)
1987 {
1988
1989         zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
1990 }
1991
1992 /* See uma.h */
1993 void *
1994 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
1995 {
1996         void *item;
1997         uma_cache_t cache;
1998         uma_bucket_t bucket;
1999         int cpu;
2000
2001         /* This is the fast path allocation */
2002 #ifdef UMA_DEBUG_ALLOC_1
2003         printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2004 #endif
2005         CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2006             zone->uz_name, flags);
2007
2008         if (flags & M_WAITOK) {
2009                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2010                     "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2011         }
2012 #ifdef DEBUG_MEMGUARD
2013         if (memguard_cmp_zone(zone)) {
2014                 item = memguard_alloc(zone->uz_size, flags);
2015                 if (item != NULL) {
2016                         /*
2017                          * Avoid conflict with the use-after-free
2018                          * protecting infrastructure from INVARIANTS.
2019                          */
2020                         if (zone->uz_init != NULL &&
2021                             zone->uz_init != mtrash_init &&
2022                             zone->uz_init(item, zone->uz_size, flags) != 0)
2023                                 return (NULL);
2024                         if (zone->uz_ctor != NULL &&
2025                             zone->uz_ctor != mtrash_ctor &&
2026                             zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2027                                 zone->uz_fini(item, zone->uz_size);
2028                                 return (NULL);
2029                         }
2030                         return (item);
2031                 }
2032                 /* This is unfortunate but should not be fatal. */
2033         }
2034 #endif
2035         /*
2036          * If possible, allocate from the per-CPU cache.  There are two
2037          * requirements for safe access to the per-CPU cache: (1) the thread
2038          * accessing the cache must not be preempted or yield during access,
2039          * and (2) the thread must not migrate CPUs without switching which
2040          * cache it accesses.  We rely on a critical section to prevent
2041          * preemption and migration.  We release the critical section in
2042          * order to acquire the zone mutex if we are unable to allocate from
2043          * the current cache; when we re-acquire the critical section, we
2044          * must detect and handle migration if it has occurred.
2045          */
2046 zalloc_restart:
2047         critical_enter();
2048         cpu = curcpu;
2049         cache = &zone->uz_cpu[cpu];
2050
2051 zalloc_start:
2052         bucket = cache->uc_allocbucket;
2053
2054         if (bucket) {
2055                 if (bucket->ub_cnt > 0) {
2056                         bucket->ub_cnt--;
2057                         item = bucket->ub_bucket[bucket->ub_cnt];
2058 #ifdef INVARIANTS
2059                         bucket->ub_bucket[bucket->ub_cnt] = NULL;
2060 #endif
2061                         KASSERT(item != NULL,
2062                             ("uma_zalloc: Bucket pointer mangled."));
2063                         cache->uc_allocs++;
2064                         critical_exit();
2065 #ifdef INVARIANTS
2066                         ZONE_LOCK(zone);
2067                         uma_dbg_alloc(zone, NULL, item);
2068                         ZONE_UNLOCK(zone);
2069 #endif
2070                         if (zone->uz_ctor != NULL) {
2071                                 if (zone->uz_ctor(item, zone->uz_size,
2072                                     udata, flags) != 0) {
2073                                         zone_free_item(zone, item, udata,
2074                                             SKIP_DTOR, ZFREE_STATFAIL |
2075                                             ZFREE_STATFREE);
2076                                         return (NULL);
2077                                 }
2078                         }
2079                         if (flags & M_ZERO)
2080                                 bzero(item, zone->uz_size);
2081                         return (item);
2082                 } else if (cache->uc_freebucket) {
2083                         /*
2084                          * We have run out of items in our allocbucket.
2085                          * See if we can switch with our free bucket.
2086                          */
2087                         if (cache->uc_freebucket->ub_cnt > 0) {
2088 #ifdef UMA_DEBUG_ALLOC
2089                                 printf("uma_zalloc: Swapping empty with"
2090                                     " alloc.\n");
2091 #endif
2092                                 bucket = cache->uc_freebucket;
2093                                 cache->uc_freebucket = cache->uc_allocbucket;
2094                                 cache->uc_allocbucket = bucket;
2095
2096                                 goto zalloc_start;
2097                         }
2098                 }
2099         }
2100         /*
2101          * Attempt to retrieve the item from the per-CPU cache has failed, so
2102          * we must go back to the zone.  This requires the zone lock, so we
2103          * must drop the critical section, then re-acquire it when we go back
2104          * to the cache.  Since the critical section is released, we may be
2105          * preempted or migrate.  As such, make sure not to maintain any
2106          * thread-local state specific to the cache from prior to releasing
2107          * the critical section.
2108          */
2109         critical_exit();
2110         ZONE_LOCK(zone);
2111         critical_enter();
2112         cpu = curcpu;
2113         cache = &zone->uz_cpu[cpu];
2114         bucket = cache->uc_allocbucket;
2115         if (bucket != NULL) {
2116                 if (bucket->ub_cnt > 0) {
2117                         ZONE_UNLOCK(zone);
2118                         goto zalloc_start;
2119                 }
2120                 bucket = cache->uc_freebucket;
2121                 if (bucket != NULL && bucket->ub_cnt > 0) {
2122                         ZONE_UNLOCK(zone);
2123                         goto zalloc_start;
2124                 }
2125         }
2126
2127         /* Since we have locked the zone we may as well send back our stats */
2128         zone->uz_allocs += cache->uc_allocs;
2129         cache->uc_allocs = 0;
2130         zone->uz_frees += cache->uc_frees;
2131         cache->uc_frees = 0;
2132
2133         /* Our old one is now a free bucket */
2134         if (cache->uc_allocbucket) {
2135                 KASSERT(cache->uc_allocbucket->ub_cnt == 0,
2136                     ("uma_zalloc_arg: Freeing a non free bucket."));
2137                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
2138                     cache->uc_allocbucket, ub_link);
2139                 cache->uc_allocbucket = NULL;
2140         }
2141
2142         /* Check the free list for a new alloc bucket */
2143         if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
2144                 KASSERT(bucket->ub_cnt != 0,
2145                     ("uma_zalloc_arg: Returning an empty bucket."));
2146
2147                 LIST_REMOVE(bucket, ub_link);
2148                 cache->uc_allocbucket = bucket;
2149                 ZONE_UNLOCK(zone);
2150                 goto zalloc_start;
2151         }
2152         /* We are no longer associated with this CPU. */
2153         critical_exit();
2154
2155         /* Bump up our uz_count so we get here less */
2156         if (zone->uz_count < BUCKET_MAX)
2157                 zone->uz_count++;
2158
2159         /*
2160          * Now lets just fill a bucket and put it on the free list.  If that
2161          * works we'll restart the allocation from the begining.
2162          */
2163         if (zone_alloc_bucket(zone, flags)) {
2164                 ZONE_UNLOCK(zone);
2165                 goto zalloc_restart;
2166         }
2167         ZONE_UNLOCK(zone);
2168         /*
2169          * We may not be able to get a bucket so return an actual item.
2170          */
2171 #ifdef UMA_DEBUG
2172         printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2173 #endif
2174
2175         item = zone_alloc_item(zone, udata, flags);
2176         return (item);
2177 }
2178
2179 static uma_slab_t
2180 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2181 {
2182         uma_slab_t slab;
2183
2184         mtx_assert(&keg->uk_lock, MA_OWNED);
2185         slab = NULL;
2186
2187         for (;;) {
2188                 /*
2189                  * Find a slab with some space.  Prefer slabs that are partially
2190                  * used over those that are totally full.  This helps to reduce
2191                  * fragmentation.
2192                  */
2193                 if (keg->uk_free != 0) {
2194                         if (!LIST_EMPTY(&keg->uk_part_slab)) {
2195                                 slab = LIST_FIRST(&keg->uk_part_slab);
2196                         } else {
2197                                 slab = LIST_FIRST(&keg->uk_free_slab);
2198                                 LIST_REMOVE(slab, us_link);
2199                                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2200                                     us_link);
2201                         }
2202                         MPASS(slab->us_keg == keg);
2203                         return (slab);
2204                 }
2205
2206                 /*
2207                  * M_NOVM means don't ask at all!
2208                  */
2209                 if (flags & M_NOVM)
2210                         break;
2211
2212                 if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2213                         keg->uk_flags |= UMA_ZFLAG_FULL;
2214                         /*
2215                          * If this is not a multi-zone, set the FULL bit.
2216                          * Otherwise slab_multi() takes care of it.
2217                          */
2218                         if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2219                                 zone->uz_flags |= UMA_ZFLAG_FULL;
2220                                 zone_log_warning(zone);
2221                         }
2222                         if (flags & M_NOWAIT)
2223                                 break;
2224                         zone->uz_sleeps++;
2225                         msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2226                         continue;
2227                 }
2228                 keg->uk_recurse++;
2229                 slab = keg_alloc_slab(keg, zone, flags);
2230                 keg->uk_recurse--;
2231                 /*
2232                  * If we got a slab here it's safe to mark it partially used
2233                  * and return.  We assume that the caller is going to remove
2234                  * at least one item.
2235                  */
2236                 if (slab) {
2237                         MPASS(slab->us_keg == keg);
2238                         LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2239                         return (slab);
2240                 }
2241                 /*
2242                  * We might not have been able to get a slab but another cpu
2243                  * could have while we were unlocked.  Check again before we
2244                  * fail.
2245                  */
2246                 flags |= M_NOVM;
2247         }
2248         return (slab);
2249 }
2250
2251 static inline void
2252 zone_relock(uma_zone_t zone, uma_keg_t keg)
2253 {
2254         if (zone->uz_lock != &keg->uk_lock) {
2255                 KEG_UNLOCK(keg);
2256                 ZONE_LOCK(zone);
2257         }
2258 }
2259
2260 static inline void
2261 keg_relock(uma_keg_t keg, uma_zone_t zone)
2262 {
2263         if (zone->uz_lock != &keg->uk_lock) {
2264                 ZONE_UNLOCK(zone);
2265                 KEG_LOCK(keg);
2266         }
2267 }
2268
2269 static uma_slab_t
2270 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2271 {
2272         uma_slab_t slab;
2273
2274         if (keg == NULL)
2275                 keg = zone_first_keg(zone);
2276         /*
2277          * This is to prevent us from recursively trying to allocate
2278          * buckets.  The problem is that if an allocation forces us to
2279          * grab a new bucket we will call page_alloc, which will go off
2280          * and cause the vm to allocate vm_map_entries.  If we need new
2281          * buckets there too we will recurse in kmem_alloc and bad
2282          * things happen.  So instead we return a NULL bucket, and make
2283          * the code that allocates buckets smart enough to deal with it
2284          */
2285         if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
2286                 return (NULL);
2287
2288         for (;;) {
2289                 slab = keg_fetch_slab(keg, zone, flags);
2290                 if (slab)
2291                         return (slab);
2292                 if (flags & (M_NOWAIT | M_NOVM))
2293                         break;
2294         }
2295         return (NULL);
2296 }
2297
2298 /*
2299  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2300  * with the keg locked.  Caller must call zone_relock() afterwards if the
2301  * zone lock is required.  On NULL the zone lock is held.
2302  *
2303  * The last pointer is used to seed the search.  It is not required.
2304  */
2305 static uma_slab_t
2306 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2307 {
2308         uma_klink_t klink;
2309         uma_slab_t slab;
2310         uma_keg_t keg;
2311         int flags;
2312         int empty;
2313         int full;
2314
2315         /*
2316          * Don't wait on the first pass.  This will skip limit tests
2317          * as well.  We don't want to block if we can find a provider
2318          * without blocking.
2319          */
2320         flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2321         /*
2322          * Use the last slab allocated as a hint for where to start
2323          * the search.
2324          */
2325         if (last) {
2326                 slab = keg_fetch_slab(last, zone, flags);
2327                 if (slab)
2328                         return (slab);
2329                 zone_relock(zone, last);
2330                 last = NULL;
2331         }
2332         /*
2333          * Loop until we have a slab incase of transient failures
2334          * while M_WAITOK is specified.  I'm not sure this is 100%
2335          * required but we've done it for so long now.
2336          */
2337         for (;;) {
2338                 empty = 0;
2339                 full = 0;
2340                 /*
2341                  * Search the available kegs for slabs.  Be careful to hold the
2342                  * correct lock while calling into the keg layer.
2343                  */
2344                 LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2345                         keg = klink->kl_keg;
2346                         keg_relock(keg, zone);
2347                         if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2348                                 slab = keg_fetch_slab(keg, zone, flags);
2349                                 if (slab)
2350                                         return (slab);
2351                         }
2352                         if (keg->uk_flags & UMA_ZFLAG_FULL)
2353                                 full++;
2354                         else
2355                                 empty++;
2356                         zone_relock(zone, keg);
2357                 }
2358                 if (rflags & (M_NOWAIT | M_NOVM))
2359                         break;
2360                 flags = rflags;
2361                 /*
2362                  * All kegs are full.  XXX We can't atomically check all kegs
2363                  * and sleep so just sleep for a short period and retry.
2364                  */
2365                 if (full && !empty) {
2366                         zone->uz_flags |= UMA_ZFLAG_FULL;
2367                         zone->uz_sleeps++;
2368                         zone_log_warning(zone);
2369                         msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
2370                         zone->uz_flags &= ~UMA_ZFLAG_FULL;
2371                         continue;
2372                 }
2373         }
2374         return (NULL);
2375 }
2376
2377 static void *
2378 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
2379 {
2380         uma_keg_t keg;
2381         uma_slabrefcnt_t slabref;
2382         void *item;
2383         u_int8_t freei;
2384
2385         keg = slab->us_keg;
2386         mtx_assert(&keg->uk_lock, MA_OWNED);
2387
2388         freei = slab->us_firstfree;
2389         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2390                 slabref = (uma_slabrefcnt_t)slab;
2391                 slab->us_firstfree = slabref->us_freelist[freei].us_item;
2392         } else {
2393                 slab->us_firstfree = slab->us_freelist[freei].us_item;
2394         }
2395         item = slab->us_data + (keg->uk_rsize * freei);
2396
2397         slab->us_freecount--;
2398         keg->uk_free--;
2399 #ifdef INVARIANTS
2400         uma_dbg_alloc(zone, slab, item);
2401 #endif
2402         /* Move this slab to the full list */
2403         if (slab->us_freecount == 0) {
2404                 LIST_REMOVE(slab, us_link);
2405                 LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2406         }
2407
2408         return (item);
2409 }
2410
2411 static int
2412 zone_alloc_bucket(uma_zone_t zone, int flags)
2413 {
2414         uma_bucket_t bucket;
2415         uma_slab_t slab;
2416         uma_keg_t keg;
2417         int16_t saved;
2418         int max, origflags = flags;
2419
2420         /*
2421          * Try this zone's free list first so we don't allocate extra buckets.
2422          */
2423         if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2424                 KASSERT(bucket->ub_cnt == 0,
2425                     ("zone_alloc_bucket: Bucket on free list is not empty."));
2426                 LIST_REMOVE(bucket, ub_link);
2427         } else {
2428                 int bflags;
2429
2430                 bflags = (flags & ~M_ZERO);
2431                 if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2432                         bflags |= M_NOVM;
2433
2434                 ZONE_UNLOCK(zone);
2435                 bucket = bucket_alloc(zone->uz_count, bflags);
2436                 ZONE_LOCK(zone);
2437         }
2438
2439         if (bucket == NULL) {
2440                 return (0);
2441         }
2442
2443 #ifdef SMP
2444         /*
2445          * This code is here to limit the number of simultaneous bucket fills
2446          * for any given zone to the number of per cpu caches in this zone. This
2447          * is done so that we don't allocate more memory than we really need.
2448          */
2449         if (zone->uz_fills >= mp_ncpus)
2450                 goto done;
2451
2452 #endif
2453         zone->uz_fills++;
2454
2455         max = MIN(bucket->ub_entries, zone->uz_count);
2456         /* Try to keep the buckets totally full */
2457         saved = bucket->ub_cnt;
2458         slab = NULL;
2459         keg = NULL;
2460         while (bucket->ub_cnt < max &&
2461             (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
2462                 keg = slab->us_keg;
2463                 while (slab->us_freecount && bucket->ub_cnt < max) {
2464                         bucket->ub_bucket[bucket->ub_cnt++] =
2465                             slab_alloc_item(zone, slab);
2466                 }
2467
2468                 /* Don't block on the next fill */
2469                 flags |= M_NOWAIT;
2470         }
2471         if (slab)
2472                 zone_relock(zone, keg);
2473
2474         /*
2475          * We unlock here because we need to call the zone's init.
2476          * It should be safe to unlock because the slab dealt with
2477          * above is already on the appropriate list within the keg
2478          * and the bucket we filled is not yet on any list, so we
2479          * own it.
2480          */
2481         if (zone->uz_init != NULL) {
2482                 int i;
2483
2484                 ZONE_UNLOCK(zone);
2485                 for (i = saved; i < bucket->ub_cnt; i++)
2486                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2487                             origflags) != 0)
2488                                 break;
2489                 /*
2490                  * If we couldn't initialize the whole bucket, put the
2491                  * rest back onto the freelist.
2492                  */
2493                 if (i != bucket->ub_cnt) {
2494                         int j;
2495
2496                         for (j = i; j < bucket->ub_cnt; j++) {
2497                                 zone_free_item(zone, bucket->ub_bucket[j],
2498                                     NULL, SKIP_FINI, 0);
2499 #ifdef INVARIANTS
2500                                 bucket->ub_bucket[j] = NULL;
2501 #endif
2502                         }
2503                         bucket->ub_cnt = i;
2504                 }
2505                 ZONE_LOCK(zone);
2506         }
2507
2508         zone->uz_fills--;
2509         if (bucket->ub_cnt != 0) {
2510                 LIST_INSERT_HEAD(&zone->uz_full_bucket,
2511                     bucket, ub_link);
2512                 return (1);
2513         }
2514 #ifdef SMP
2515 done:
2516 #endif
2517         bucket_free(bucket);
2518
2519         return (0);
2520 }
2521 /*
2522  * Allocates an item for an internal zone
2523  *
2524  * Arguments
2525  *      zone   The zone to alloc for.
2526  *      udata  The data to be passed to the constructor.
2527  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
2528  *
2529  * Returns
2530  *      NULL if there is no memory and M_NOWAIT is set
2531  *      An item if successful
2532  */
2533
2534 static void *
2535 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2536 {
2537         uma_slab_t slab;
2538         void *item;
2539
2540         item = NULL;
2541
2542 #ifdef UMA_DEBUG_ALLOC
2543         printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2544 #endif
2545         ZONE_LOCK(zone);
2546
2547         slab = zone->uz_slab(zone, NULL, flags);
2548         if (slab == NULL) {
2549                 zone->uz_fails++;
2550                 ZONE_UNLOCK(zone);
2551                 return (NULL);
2552         }
2553
2554         item = slab_alloc_item(zone, slab);
2555
2556         zone_relock(zone, slab->us_keg);
2557         zone->uz_allocs++;
2558         ZONE_UNLOCK(zone);
2559
2560         /*
2561          * We have to call both the zone's init (not the keg's init)
2562          * and the zone's ctor.  This is because the item is going from
2563          * a keg slab directly to the user, and the user is expecting it
2564          * to be both zone-init'd as well as zone-ctor'd.
2565          */
2566         if (zone->uz_init != NULL) {
2567                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2568                         zone_free_item(zone, item, udata, SKIP_FINI,
2569                             ZFREE_STATFAIL | ZFREE_STATFREE);
2570                         return (NULL);
2571                 }
2572         }
2573         if (zone->uz_ctor != NULL) {
2574                 if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2575                         zone_free_item(zone, item, udata, SKIP_DTOR,
2576                             ZFREE_STATFAIL | ZFREE_STATFREE);
2577                         return (NULL);
2578                 }
2579         }
2580         if (flags & M_ZERO)
2581                 bzero(item, zone->uz_size);
2582
2583         return (item);
2584 }
2585
2586 /* See uma.h */
     /*
      * Frees an item back to a zone.  The zone destructor runs first; the item
      * is then stored in the current CPU's free bucket when there is room.  If
      * the zone is flagged full, or no bucket can be obtained, the item is
      * handed to zone_free_item() instead.
      *
      * Arguments:
      *      zone   The zone the item is returned to
      *      item   The item to free; NULL is accepted and ignored
      *      udata  Opaque argument passed through to the destructor
      */
2587 void
2588 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2589 {
2590         uma_cache_t cache;
2591         uma_bucket_t bucket;
2592         int bflags;
2593         int cpu;
2594
2595 #ifdef UMA_DEBUG_ALLOC_1
2596         printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2597 #endif
2598         CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2599             zone->uz_name);
2600
2601         /* uma_zfree(..., NULL) does nothing, to match free(9). */
2602         if (item == NULL)
2603                 return;
2604 #ifdef DEBUG_MEMGUARD
             /*
              * Memguard addresses never enter the buckets: run the dtor and
              * fini callbacks (except the mtrash ones) and give the memory
              * back to memguard.
              */
2605         if (is_memguard_addr(item)) {
2606                 if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
2607                         zone->uz_dtor(item, zone->uz_size, udata);
2608                 if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
2609                         zone->uz_fini(item, zone->uz_size);
2610                 memguard_free(item);
2611                 return;
2612         }
2613 #endif
             /*
              * The destructor runs exactly once, here; the fallback at
              * zfree_internal passes SKIP_DTOR for that reason.
              */
2614         if (zone->uz_dtor)
2615                 zone->uz_dtor(item, zone->uz_size, udata);
2616
2617 #ifdef INVARIANTS
2618         ZONE_LOCK(zone);
2619         if (zone->uz_flags & UMA_ZONE_MALLOC)
2620                 uma_dbg_free(zone, udata, item);
2621         else
2622                 uma_dbg_free(zone, NULL, item);
2623         ZONE_UNLOCK(zone);
2624 #endif
2625         /*
2626          * The race here is acceptable.  If we miss it we'll just have to wait
2627          * a little longer for the limits to be reset.
2628          */
2629         if (zone->uz_flags & UMA_ZFLAG_FULL)
2630                 goto zfree_internal;
2631
2632         /*
2633          * If possible, free to the per-CPU cache.  There are two
2634          * requirements for safe access to the per-CPU cache: (1) the thread
2635          * accessing the cache must not be preempted or yield during access,
2636          * and (2) the thread must not migrate CPUs without switching which
2637          * cache it accesses.  We rely on a critical section to prevent
2638          * preemption and migration.  We release the critical section in
2639          * order to acquire the zone mutex if we are unable to free to the
2640          * current cache; when we re-acquire the critical section, we must
2641          * detect and handle migration if it has occurred.
2642          */
2643 zfree_restart:
2644         critical_enter();
2645         cpu = curcpu;
2646         cache = &zone->uz_cpu[cpu];
2647
     /* Entered with the critical section held and `cache' set for this CPU. */
2648 zfree_start:
2649         bucket = cache->uc_freebucket;
2650
2651         if (bucket) {
2652                 /*
2653                  * Do we have room in our bucket? It is OK for this uz count
2654                  * check to be slightly out of sync.
2655                  */
2656
2657                 if (bucket->ub_cnt < bucket->ub_entries) {
2658                         KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2659                             ("uma_zfree: Freeing to non free bucket index."));
2660                         bucket->ub_bucket[bucket->ub_cnt] = item;
2661                         bucket->ub_cnt++;
2662                         cache->uc_frees++;
2663                         critical_exit();
2664                         return;
2665                 } else if (cache->uc_allocbucket) {
2666 #ifdef UMA_DEBUG_ALLOC
2667                         printf("uma_zfree: Swapping buckets.\n");
2668 #endif
2669                         /*
2670                          * We have run out of space in our freebucket.
2671                          * See if we can switch with our alloc bucket.
2672                          */
2673                         if (cache->uc_allocbucket->ub_cnt <
2674                             cache->uc_freebucket->ub_cnt) {
2675                                 bucket = cache->uc_freebucket;
2676                                 cache->uc_freebucket = cache->uc_allocbucket;
2677                                 cache->uc_allocbucket = bucket;
2678                                 goto zfree_start;
2679                         }
2680                 }
2681         }
2682         /*
2683          * We can get here for two reasons:
2684          *
2685          * 1) The buckets are NULL
2686          * 2) The alloc and free buckets are both somewhat full.
2687          *
2688          * We must go back to the zone, which requires acquiring the zone
2689          * lock, which in turn means we must release and re-acquire the critical
2690          * section.  Since the critical section is released, we may be
2691          * preempted or migrate.  As such, make sure not to maintain any
2692          * thread-local state specific to the cache from prior to releasing
2693          * the critical section.
2694          */
2695         critical_exit();
2696         ZONE_LOCK(zone);
2697         critical_enter();
2698         cpu = curcpu;
2699         cache = &zone->uz_cpu[cpu];
             /*
              * Re-check the (possibly different) cache: if its free bucket has
              * room, or a bucket swap would make room, drop the zone lock and
              * retry the fast path.
              */
2700         if (cache->uc_freebucket != NULL) {
2701                 if (cache->uc_freebucket->ub_cnt <
2702                     cache->uc_freebucket->ub_entries) {
2703                         ZONE_UNLOCK(zone);
2704                         goto zfree_start;
2705                 }
2706                 if (cache->uc_allocbucket != NULL &&
2707                     (cache->uc_allocbucket->ub_cnt <
2708                     cache->uc_freebucket->ub_cnt)) {
2709                         ZONE_UNLOCK(zone);
2710                         goto zfree_start;
2711                 }
2712         }
2713
2714         /* Since we have locked the zone we may as well send back our stats */
2715         zone->uz_allocs += cache->uc_allocs;
2716         cache->uc_allocs = 0;
2717         zone->uz_frees += cache->uc_frees;
2718         cache->uc_frees = 0;
2719
2720         bucket = cache->uc_freebucket;
2721         cache->uc_freebucket = NULL;
2722
2723         /* Can we throw this on the zone full list? */
2724         if (bucket != NULL) {
2725 #ifdef UMA_DEBUG_ALLOC
2726                 printf("uma_zfree: Putting old bucket on the free list.\n");
2727 #endif
2728                 /* ub_cnt is pointing to the last free item */
2729                 KASSERT(bucket->ub_cnt != 0,
2730                     ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2731                 LIST_INSERT_HEAD(&zone->uz_full_bucket,
2732                     bucket, ub_link);
2733         }
             /* Take a bucket from the zone's free-bucket list, if it has one. */
2734         if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2735                 LIST_REMOVE(bucket, ub_link);
2736                 ZONE_UNLOCK(zone);
2737                 cache->uc_freebucket = bucket;
2738                 goto zfree_start;
2739         }
2740         /* We are no longer associated with this CPU. */
2741         critical_exit();
2742
2743         /* And the zone.. */
2744         ZONE_UNLOCK(zone);
2745
2746 #ifdef UMA_DEBUG_ALLOC
2747         printf("uma_zfree: Allocating new free bucket.\n");
2748 #endif
2749         bflags = M_NOWAIT;
2750
2751         if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2752                 bflags |= M_NOVM;
2753         bucket = bucket_alloc(zone->uz_count, bflags);
             /*
              * A freshly allocated bucket is parked on the zone's free-bucket
              * list and the whole operation restarts from zfree_restart.
              */
2754         if (bucket) {
2755                 ZONE_LOCK(zone);
2756                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
2757                     bucket, ub_link);
2758                 ZONE_UNLOCK(zone);
2759                 goto zfree_restart;
2760         }
2761
2762         /*
2763          * If nothing else caught this, we'll just do an internal free.
2764          */
2765 zfree_internal:
2766         zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
2767
2768         return;
2769 }
2770
2771 /*
2772  * Frees an item to an INTERNAL zone or allocates a free bucket
2773  *
2774  * Arguments:
2775  *      zone   The zone to free to
2776  *      item   The item we're freeing
2777  *      udata  User supplied data for the dtor
2778  *      skip   Skip dtors and finis
2779  */
2780 static void
2781 zone_free_item(uma_zone_t zone, void *item, void *udata,
2782     enum zfreeskip skip, int flags)
2783 {
2784         uma_slab_t slab;
2785         uma_slabrefcnt_t slabref;
2786         uma_keg_t keg;
2787         u_int8_t *mem;
2788         u_int8_t freei;
2789         int clearfull;
2790
2791         if (skip < SKIP_DTOR && zone->uz_dtor)
2792                 zone->uz_dtor(item, zone->uz_size, udata);
2793
2794         if (skip < SKIP_FINI && zone->uz_fini)
2795                 zone->uz_fini(item, zone->uz_size);
2796
2797         ZONE_LOCK(zone);
2798
2799         if (flags & ZFREE_STATFAIL)
2800                 zone->uz_fails++;
2801         if (flags & ZFREE_STATFREE)
2802                 zone->uz_frees++;
2803
2804         if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2805                 mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2806                 keg = zone_first_keg(zone); /* Must only be one. */
2807                 if (zone->uz_flags & UMA_ZONE_HASH) {
2808                         slab = hash_sfind(&keg->uk_hash, mem);
2809                 } else {
2810                         mem += keg->uk_pgoff;
2811                         slab = (uma_slab_t)mem;
2812                 }
2813         } else {
2814                 /* This prevents redundant lookups via free(). */
2815                 if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
2816                         slab = (uma_slab_t)udata;
2817                 else
2818                         slab = vtoslab((vm_offset_t)item);
2819                 keg = slab->us_keg;
2820                 keg_relock(keg, zone);
2821         }
2822         MPASS(keg == slab->us_keg);
2823
2824         /* Do we need to remove from any lists? */
2825         if (slab->us_freecount+1 == keg->uk_ipers) {
2826                 LIST_REMOVE(slab, us_link);
2827                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2828         } else if (slab->us_freecount == 0) {
2829                 LIST_REMOVE(slab, us_link);
2830                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2831         }
2832
2833         /* Slab management stuff */
2834         freei = ((unsigned long)item - (unsigned long)slab->us_data)
2835                 / keg->uk_rsize;
2836
2837 #ifdef INVARIANTS
2838         if (!skip)
2839                 uma_dbg_free(zone, slab, item);
2840 #endif
2841
2842         if (keg->uk_flags & UMA_ZONE_REFCNT) {
2843                 slabref = (uma_slabrefcnt_t)slab;
2844                 slabref->us_freelist[freei].us_item = slab->us_firstfree;
2845         } else {
2846                 slab->us_freelist[freei].us_item = slab->us_firstfree;
2847         }
2848         slab->us_firstfree = freei;
2849         slab->us_freecount++;
2850
2851         /* Zone statistics */
2852         keg->uk_free++;
2853
2854         clearfull = 0;
2855         if (keg->uk_flags & UMA_ZFLAG_FULL) {
2856                 if (keg->uk_pages < keg->uk_maxpages) {
2857                         keg->uk_flags &= ~UMA_ZFLAG_FULL;
2858                         clearfull = 1;
2859                 }
2860
2861                 /*
2862                  * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
2863                  * wake up all procs blocked on pages. This should be uncommon, so
2864                  * keeping this simple for now (rather than adding count of blocked
2865                  * threads etc).
2866                  */
2867                 wakeup(keg);
2868         }
2869         if (clearfull) {
2870                 zone_relock(zone, keg);
2871                 zone->uz_flags &= ~UMA_ZFLAG_FULL;
2872                 wakeup(zone);
2873                 ZONE_UNLOCK(zone);
2874         } else
2875                 KEG_UNLOCK(keg);
2876 }
2877
2878 /* See uma.h */
2879 int
2880 uma_zone_set_max(uma_zone_t zone, int nitems)
2881 {
2882         uma_keg_t keg;
2883
2884         ZONE_LOCK(zone);
2885         keg = zone_first_keg(zone);
2886         keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2887         if (keg->uk_maxpages * keg->uk_ipers < nitems)
2888                 keg->uk_maxpages += keg->uk_ppera;
2889         nitems = keg->uk_maxpages * keg->uk_ipers;
2890         ZONE_UNLOCK(zone);
2891
2892         return (nitems);
2893 }
2894
2895 /* See uma.h */
2896 int
2897 uma_zone_get_max(uma_zone_t zone)
2898 {
2899         int nitems;
2900         uma_keg_t keg;
2901
2902         ZONE_LOCK(zone);
2903         keg = zone_first_keg(zone);
2904         nitems = keg->uk_maxpages * keg->uk_ipers;
2905         ZONE_UNLOCK(zone);
2906
2907         return (nitems);
2908 }
2909
2910 /* See uma.h */
2911 void
2912 uma_zone_set_warning(uma_zone_t zone, const char *warning)
2913 {
2914
2915         ZONE_LOCK(zone);
2916         zone->uz_warning = warning;
2917         ZONE_UNLOCK(zone);
2918 }
2919
2920 /* See uma.h */
2921 int
2922 uma_zone_get_cur(uma_zone_t zone)
2923 {
2924         int64_t nitems;
2925         u_int i;
2926
2927         ZONE_LOCK(zone);
2928         nitems = zone->uz_allocs - zone->uz_frees;
2929         CPU_FOREACH(i) {
2930                 /*
2931                  * See the comment in sysctl_vm_zone_stats() regarding the
2932                  * safety of accessing the per-cpu caches. With the zone lock
2933                  * held, it is safe, but can potentially result in stale data.
2934                  */
2935                 nitems += zone->uz_cpu[i].uc_allocs -
2936                     zone->uz_cpu[i].uc_frees;
2937         }
2938         ZONE_UNLOCK(zone);
2939
2940         return (nitems < 0 ? 0 : nitems);
2941 }
2942
2943 /* See uma.h */
2944 void
2945 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2946 {
2947         uma_keg_t keg;
2948
2949         ZONE_LOCK(zone);
2950         keg = zone_first_keg(zone);
2951         KASSERT(keg->uk_pages == 0,
2952             ("uma_zone_set_init on non-empty keg"));
2953         keg->uk_init = uminit;
2954         ZONE_UNLOCK(zone);
2955 }
2956
2957 /* See uma.h */
2958 void
2959 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2960 {
2961         uma_keg_t keg;
2962
2963         ZONE_LOCK(zone);
2964         keg = zone_first_keg(zone);
2965         KASSERT(keg->uk_pages == 0,
2966             ("uma_zone_set_fini on non-empty keg"));
2967         keg->uk_fini = fini;
2968         ZONE_UNLOCK(zone);
2969 }
2970
2971 /* See uma.h */
2972 void
2973 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2974 {
2975         ZONE_LOCK(zone);
2976         KASSERT(zone_first_keg(zone)->uk_pages == 0,
2977             ("uma_zone_set_zinit on non-empty keg"));
2978         zone->uz_init = zinit;
2979         ZONE_UNLOCK(zone);
2980 }
2981
2982 /* See uma.h */
2983 void
2984 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
2985 {
2986         ZONE_LOCK(zone);
2987         KASSERT(zone_first_keg(zone)->uk_pages == 0,
2988             ("uma_zone_set_zfini on non-empty keg"));
2989         zone->uz_fini = zfini;
2990         ZONE_UNLOCK(zone);
2991 }
2992
2993 /* See uma.h */
2994 /* XXX uk_freef is not actually used with the zone locked */
2995 void
2996 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
2997 {
2998
2999         ZONE_LOCK(zone);
3000         zone_first_keg(zone)->uk_freef = freef;
3001         ZONE_UNLOCK(zone);
3002 }
3003
3004 /* See uma.h */
3005 /* XXX uk_allocf is not actually used with the zone locked */
3006 void
3007 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3008 {
3009         uma_keg_t keg;
3010
3011         ZONE_LOCK(zone);
3012         keg = zone_first_keg(zone);
3013         keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
3014         keg->uk_allocf = allocf;
3015         ZONE_UNLOCK(zone);
3016 }
3017
3018 /* See uma.h */
3019 int
3020 uma_zone_reserve_kva(uma_zone_t zone, int count)
3021 {
3022         uma_keg_t keg;
3023         vm_offset_t kva;
3024         int pages;
3025
3026         keg = zone_first_keg(zone);
3027         pages = count / keg->uk_ipers;
3028
3029         if (pages * keg->uk_ipers < count)
3030                 pages++;
3031
3032 #ifdef UMA_MD_SMALL_ALLOC
3033         if (keg->uk_ppera > 1) {
3034 #else
3035         if (1) {
3036 #endif
3037                 kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
3038                 if (kva == 0)
3039                         return (0);
3040         } else
3041                 kva = 0;
3042         ZONE_LOCK(zone);
3043         keg->uk_kva = kva;
3044         keg->uk_offset = 0;
3045         keg->uk_maxpages = pages;
3046 #ifdef UMA_MD_SMALL_ALLOC
3047         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3048 #else
3049         keg->uk_allocf = noobj_alloc;
3050 #endif
3051         keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
3052         ZONE_UNLOCK(zone);
3053         return (1);
3054 }
3055
3056 /* See uma.h */
3057 void
3058 uma_prealloc(uma_zone_t zone, int items)
3059 {
3060         int slabs;
3061         uma_slab_t slab;
3062         uma_keg_t keg;
3063
3064         keg = zone_first_keg(zone);
3065         ZONE_LOCK(zone);
3066         slabs = items / keg->uk_ipers;
3067         if (slabs * keg->uk_ipers < items)
3068                 slabs++;
3069         while (slabs > 0) {
3070                 slab = keg_alloc_slab(keg, zone, M_WAITOK);
3071                 if (slab == NULL)
3072                         break;
3073                 MPASS(slab->us_keg == keg);
3074                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3075                 slabs--;
3076         }
3077         ZONE_UNLOCK(zone);
3078 }
3079
3080 /* See uma.h */
3081 u_int32_t *
3082 uma_find_refcnt(uma_zone_t zone, void *item)
3083 {
3084         uma_slabrefcnt_t slabref;
3085         uma_keg_t keg;
3086         u_int32_t *refcnt;
3087         int idx;
3088
3089         slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
3090             (~UMA_SLAB_MASK));
3091         keg = slabref->us_keg;
3092         KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
3093             ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
3094         idx = ((unsigned long)item - (unsigned long)slabref->us_data)
3095             / keg->uk_rsize;
3096         refcnt = &slabref->us_freelist[idx].us_refcnt;
3097         return refcnt;
3098 }
3099
3100 /* See uma.h */
3101 void
3102 uma_reclaim(void)
3103 {
3104 #ifdef UMA_DEBUG
3105         printf("UMA: vm asked us to release pages!\n");
3106 #endif
3107         bucket_enable();
3108         zone_foreach(zone_drain);
3109         /*
3110          * Some slabs may have been freed but this zone will be visited early
3111          * we visit again so that we can free pages that are empty once other
3112          * zones are drained.  We have to do the same for buckets.
3113          */
3114         zone_drain(slabzone);
3115         zone_drain(slabrefzone);
3116         bucket_zone_drain();
3117 }
3118
3119 /* See uma.h */
3120 int
3121 uma_zone_exhausted(uma_zone_t zone)
3122 {
3123         int full;
3124
3125         ZONE_LOCK(zone);
3126         full = (zone->uz_flags & UMA_ZFLAG_FULL);
3127         ZONE_UNLOCK(zone);
3128         return (full);
3129 }
3130
3131 int
3132 uma_zone_exhausted_nolock(uma_zone_t zone)
3133 {
3134         return (zone->uz_flags & UMA_ZFLAG_FULL);
3135 }
3136
3137 void *
3138 uma_large_malloc(int size, int wait)
3139 {
3140         void *mem;
3141         uma_slab_t slab;
3142         u_int8_t flags;
3143
3144         slab = zone_alloc_item(slabzone, NULL, wait);
3145         if (slab == NULL)
3146                 return (NULL);
3147         mem = page_alloc(NULL, size, &flags, wait);
3148         if (mem) {
3149                 vsetslab((vm_offset_t)mem, slab);
3150                 slab->us_data = mem;
3151                 slab->us_flags = flags | UMA_SLAB_MALLOC;
3152                 slab->us_size = size;
3153         } else {
3154                 zone_free_item(slabzone, slab, NULL, SKIP_NONE,
3155                     ZFREE_STATFAIL | ZFREE_STATFREE);
3156         }
3157
3158         return (mem);
3159 }
3160
3161 void
3162 uma_large_free(uma_slab_t slab)
3163 {
3164         vsetobj((vm_offset_t)slab->us_data, kmem_object);
3165         page_free(slab->us_data, slab->us_size, slab->us_flags);
3166         zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
3167 }
3168
3169 void
3170 uma_print_stats(void)
3171 {
3172         zone_foreach(uma_print_zone);
3173 }
3174
3175 static void
3176 slab_print(uma_slab_t slab)
3177 {
3178         printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
3179                 slab->us_keg, slab->us_data, slab->us_freecount,
3180                 slab->us_firstfree);
3181 }
3182
3183 static void
3184 cache_print(uma_cache_t cache)
3185 {
3186         printf("alloc: %p(%d), free: %p(%d)\n",
3187                 cache->uc_allocbucket,
3188                 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3189                 cache->uc_freebucket,
3190                 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3191 }
3192
3193 static void
3194 uma_print_keg(uma_keg_t keg)
3195 {
3196         uma_slab_t slab;
3197
3198         printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3199             "out %d free %d limit %d\n",
3200             keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3201             keg->uk_ipers, keg->uk_ppera,
3202             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3203             (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3204         printf("Part slabs:\n");
3205         LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3206                 slab_print(slab);
3207         printf("Free slabs:\n");
3208         LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3209                 slab_print(slab);
3210         printf("Full slabs:\n");
3211         LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3212                 slab_print(slab);
3213 }
3214
3215 void
3216 uma_print_zone(uma_zone_t zone)
3217 {
3218         uma_cache_t cache;
3219         uma_klink_t kl;
3220         int i;
3221
3222         printf("zone: %s(%p) size %d flags %#x\n",
3223             zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3224         LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3225                 uma_print_keg(kl->kl_keg);
3226         CPU_FOREACH(i) {
3227                 cache = &zone->uz_cpu[i];
3228                 printf("CPU %d Cache:\n", i);
3229                 cache_print(cache);
3230         }
3231 }
3232
#ifdef DDB
/*
 * Sums statistics over a zone and its per-CPU caches.  Each output pointer
 * is optional: a statistic is only stored when its pointer is non-NULL.
 *
 * Arguments:
 *	z           The zone to summarize
 *	cachefreep  Receives the items held in the per-CPU buckets
 *	allocsp     Receives zone plus per-CPU allocation counts
 *	freesp      Receives zone plus per-CPU free counts
 *	sleepsp     Receives the zone's sleep count
 *
 * Note: the zone statistics are not updated, as the per-CPU cache
 * statistics can't safely be cleared from here.
 *
 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
 * safe from off-CPU; we should modify the caches to track this information
 * directly so that we don't have to.
 */
static void
uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
    u_int64_t *freesp, u_int64_t *sleepsp)
{
	uma_cache_t pcpu;
	u_int64_t nalloc, nfree, nsleep;
	int cached, cpu;

	/* Start from the zone-wide counters ... */
	nalloc = z->uz_allocs;
	nfree = z->uz_frees;
	nsleep = z->uz_sleeps;
	cached = 0;

	/* ... and add what each CPU's cache contributes. */
	CPU_FOREACH(cpu) {
		pcpu = &z->uz_cpu[cpu];
		if (pcpu->uc_allocbucket != NULL)
			cached += pcpu->uc_allocbucket->ub_cnt;
		if (pcpu->uc_freebucket != NULL)
			cached += pcpu->uc_freebucket->ub_cnt;
		nalloc += pcpu->uc_allocs;
		nfree += pcpu->uc_frees;
	}

	if (cachefreep != NULL)
		*cachefreep = cached;
	if (allocsp != NULL)
		*allocsp = nalloc;
	if (freesp != NULL)
		*freesp = nfree;
	if (sleepsp != NULL)
		*sleepsp = nsleep;
}
#endif /* DDB */
3277
3278 static int
3279 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3280 {
3281         uma_keg_t kz;
3282         uma_zone_t z;
3283         int count;
3284
3285         count = 0;
3286         mtx_lock(&uma_mtx);
3287         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3288                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3289                         count++;
3290         }
3291         mtx_unlock(&uma_mtx);
3292         return (sysctl_handle_int(oidp, &count, 0, req));
3293 }
3294
3295 static int
3296 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3297 {
3298         struct uma_stream_header ush;
3299         struct uma_type_header uth;
3300         struct uma_percpu_stat ups;
3301         uma_bucket_t bucket;
3302         struct sbuf sbuf;
3303         uma_cache_t cache;
3304         uma_klink_t kl;
3305         uma_keg_t kz;
3306         uma_zone_t z;
3307         uma_keg_t k;
3308         int count, error, i;
3309
3310         error = sysctl_wire_old_buffer(req, 0);
3311         if (error != 0)
3312                 return (error);
3313         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3314
3315         count = 0;
3316         mtx_lock(&uma_mtx);
3317         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3318                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3319                         count++;
3320         }
3321
3322         /*
3323          * Insert stream header.
3324          */
3325         bzero(&ush, sizeof(ush));
3326         ush.ush_version = UMA_STREAM_VERSION;
3327         ush.ush_maxcpus = (mp_maxid + 1);
3328         ush.ush_count = count;
3329         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3330
3331         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3332                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3333                         bzero(&uth, sizeof(uth));
3334                         ZONE_LOCK(z);
3335                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3336                         uth.uth_align = kz->uk_align;
3337                         uth.uth_size = kz->uk_size;
3338                         uth.uth_rsize = kz->uk_rsize;
3339                         LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3340                                 k = kl->kl_keg;
3341                                 uth.uth_maxpages += k->uk_maxpages;
3342                                 uth.uth_pages += k->uk_pages;
3343                                 uth.uth_keg_free += k->uk_free;
3344                                 uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3345                                     * k->uk_ipers;
3346                         }
3347
3348                         /*
3349                          * A zone is secondary is it is not the first entry
3350                          * on the keg's zone list.
3351                          */
3352                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3353                             (LIST_FIRST(&kz->uk_zones) != z))
3354                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3355
3356                         LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3357                                 uth.uth_zone_free += bucket->ub_cnt;
3358                         uth.uth_allocs = z->uz_allocs;
3359                         uth.uth_frees = z->uz_frees;
3360                         uth.uth_fails = z->uz_fails;
3361                         uth.uth_sleeps = z->uz_sleeps;
3362                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3363                         /*
3364                          * While it is not normally safe to access the cache
3365                          * bucket pointers while not on the CPU that owns the
3366                          * cache, we only allow the pointers to be exchanged
3367                          * without the zone lock held, not invalidated, so
3368                          * accept the possible race associated with bucket
3369                          * exchange during monitoring.
3370                          */
3371                         for (i = 0; i < (mp_maxid + 1); i++) {
3372                                 bzero(&ups, sizeof(ups));
3373                                 if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3374                                         goto skip;
3375                                 if (CPU_ABSENT(i))
3376                                         goto skip;
3377                                 cache = &z->uz_cpu[i];
3378                                 if (cache->uc_allocbucket != NULL)
3379                                         ups.ups_cache_free +=
3380                                             cache->uc_allocbucket->ub_cnt;
3381                                 if (cache->uc_freebucket != NULL)
3382                                         ups.ups_cache_free +=
3383                                             cache->uc_freebucket->ub_cnt;
3384                                 ups.ups_allocs = cache->uc_allocs;
3385                                 ups.ups_frees = cache->uc_frees;
3386 skip:
3387                                 (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3388                         }
3389                         ZONE_UNLOCK(z);
3390                 }
3391         }
3392         mtx_unlock(&uma_mtx);
3393         error = sbuf_finish(&sbuf);
3394         sbuf_delete(&sbuf);
3395         return (error);
3396 }
3397
#ifdef DDB
/*
 * DDB "show uma" command: prints one line per zone with the item size,
 * the number of items in use (allocs - frees), the number of free items,
 * the total number of requests and the number of sleeps.
 *
 * The free count covers the per-CPU buckets, the zone's full buckets and,
 * unless the zone is a secondary zone that is not first on its keg's zone
 * list, the keg's free items.  Output stops as soon as the pager is quit.
 */
DB_SHOW_COMMAND(uma, db_show_uma)
{
	u_int64_t allocs, frees, sleeps;
	uma_bucket_t bucket;
	uma_keg_t kz;
	uma_zone_t z;
	int cachefree;

	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
	    "Requests", "Sleeps");
	LIST_FOREACH(kz, &uma_kegs, uk_link) {
		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
				/* Internal kegs: use the zone counters only. */
				allocs = z->uz_allocs;
				frees = z->uz_frees;
				sleeps = z->uz_sleeps;
				cachefree = 0;
			} else
				uma_zone_sumstat(z, &cachefree, &allocs,
				    &frees, &sleeps);
			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
			    (LIST_FIRST(&kz->uk_zones) != z)))
				cachefree += kz->uk_free;
			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
				cachefree += bucket->ub_cnt;
			/* Every %j conversion gets an explicit (u)intmax_t. */
			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
			    (uintmax_t)kz->uk_size,
			    (intmax_t)(allocs - frees), cachefree,
			    (uintmax_t)allocs, (uintmax_t)sleeps);
			if (db_pager_quit)
				return;
		}
	}
}
#endif