sys/vm/uma_core.c

   1 /*-
   2  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
   3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   4  * Copyright (c) 2004-2006 Robert N. M. Watson
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice unmodified, this list of conditions, and the following
  12  *    disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * uma_core.c  Implementation of the Universal Memory allocator
  31  *
  32  * This allocator is intended to replace the multitude of similar object caches
  33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
   34  * efficient.  A primary design goal is to return unused memory to the rest of
  35  * the system.  This will make the system as a whole more flexible due to the
  36  * ability to move memory to subsystems which most need it instead of leaving
  37  * pools of reserved memory unused.
  38  *
  39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  40  * are well known.
  41  *
  42  */
  43
  44 /*
  45  * TODO:
  46  *      - Improve memory usage for large allocations
  47  *      - Investigate cache size adjustments
  48  */
  49
  50 #include <sys/cdefs.h>
  51 __FBSDID("$FreeBSD$");
  52
  53 /* I should really use ktr.. */
  54 /*
  55 #define UMA_DEBUG 1
  56 #define UMA_DEBUG_ALLOC 1
  57 #define UMA_DEBUG_ALLOC_1 1
  58 */
  59
  60 #include "opt_ddb.h"
  61 #include "opt_param.h"
  62 #include "opt_vm.h"
  63
  64 #include <sys/param.h>
  65 #include <sys/systm.h>
  66 #include <sys/bitset.h>
  67 #include <sys/kernel.h>
  68 #include <sys/types.h>
  69 #include <sys/queue.h>
  70 #include <sys/malloc.h>
  71 #include <sys/ktr.h>
  72 #include <sys/lock.h>
  73 #include <sys/sysctl.h>
  74 #include <sys/mutex.h>
  75 #include <sys/proc.h>
  76 #include <sys/random.h>
  77 #include <sys/rwlock.h>
  78 #include <sys/sbuf.h>
  79 #include <sys/sched.h>
  80 #include <sys/smp.h>
  81 #include <sys/vmmeter.h>
  82
  83 #include <vm/vm.h>
  84 #include <vm/vm_object.h>
  85 #include <vm/vm_page.h>
  86 #include <vm/vm_pageout.h>
  87 #include <vm/vm_param.h>
  88 #include <vm/vm_map.h>
  89 #include <vm/vm_kern.h>
  90 #include <vm/vm_extern.h>
  91 #include <vm/uma.h>
  92 #include <vm/uma_int.h>
  93 #include <vm/uma_dbg.h>
  94
  95 #include <ddb/ddb.h>
  96
  97 #ifdef DEBUG_MEMGUARD
  98 #include <vm/memguard.h>
  99 #endif
 100
 101 /*
 102  * This is the zone and keg from which all zones are spawned.  The idea is that
 103  * even the zone & keg heads are allocated from the allocator, so we use the
 104  * bss section to bootstrap us.
 105  */
 106 static struct uma_keg masterkeg;
 107 static struct uma_zone masterzone_k;
 108 static struct uma_zone masterzone_z;
 109 static uma_zone_t kegs = &masterzone_k;
 110 static uma_zone_t zones = &masterzone_z;
 111
 112 /* This is the zone from which all of uma_slab_t's are allocated. */
 113 static uma_zone_t slabzone;
 114 static uma_zone_t slabrefzone;  /* With refcounters (for UMA_ZONE_REFCNT) */
 115
 116 /*
 117  * The initial hash tables come out of this zone so they can be allocated
 118  * prior to malloc coming up.
 119  */
 120 static uma_zone_t hashzone;
 121
 122 /* The boot-time adjusted value for cache line alignment. */
 123 int uma_align_cache = 64 - 1;
 124
 125 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 126
 127 /*
 128  * Are we allowed to allocate buckets?
 129  */
 130 static int bucketdisable = 1;
 131
 132 /* Linked list of all kegs in the system */
 133 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 134
 135 /* Linked list of all cache-only zones in the system */
 136 static LIST_HEAD(,uma_zone) uma_cachezones =
 137     LIST_HEAD_INITIALIZER(uma_cachezones);
 138
 139 /* This RW lock protects the keg list */
 140 static struct rwlock_padalign uma_rwlock;
 141
 142 /* Linked list of boot time pages */
 143 static LIST_HEAD(,uma_slab) uma_boot_pages =
 144     LIST_HEAD_INITIALIZER(uma_boot_pages);
 145
 146 /* This mutex protects the boot time pages list */
 147 static struct mtx_padalign uma_boot_pages_mtx;
 148
 149 static struct sx uma_drain_lock;
 150
 151 /* Is the VM done starting up? */
 152 static int booted = 0;
 153 #define UMA_STARTUP     1
 154 #define UMA_STARTUP2    2
 155
 156 /*
 157  * Only mbuf clusters use ref zones.  Just provide enough references
 158  * to support the one user.  New code should not use the ref facility.
 159  */
 160 static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
 161
 162 /*
 163  * This is the handle used to schedule events that need to happen
 164  * outside of the allocation fast path.
 165  */
 166 static struct callout uma_callout;
 167 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 168
/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 *
 * NOTE(review): name, size, ctor, dtor, uminit, fini, align and flags line up
 * with the arguments of the uma_zcreate() call made in bucket_init();
 * presumably they carry exactly those values to zone_ctor() -- confirm there.
 */
struct uma_zctor_args {
        const char *name;
        size_t size;
        uma_ctor ctor;
        uma_dtor dtor;
        uma_init uminit;
        uma_fini fini;
        uma_import import;
        uma_release release;
        void *arg;
        uma_keg_t keg;
        int align;
        uint32_t flags;
};
 187
/*
 * Keg counterpart of struct uma_zctor_args.  The members are the same, in
 * the same order, as the parameters of uma_kcreate().
 *
 * NOTE(review): presumably handed to keg_ctor() as its argument cookie --
 * confirm against uma_kcreate().
 */
struct uma_kctor_args {
        uma_zone_t zone;
        size_t size;
        uma_init uminit;
        uma_fini fini;
        int align;
        uint32_t flags;
};
 196
 197 struct uma_bucket_zone {
 198         uma_zone_t      ubz_zone;
 199         char            *ubz_name;
 200         int             ubz_entries;    /* Number of items it can hold. */
 201         int             ubz_maxsize;    /* Maximum allocation size per-item. */
 202 };
 203
/*
 * Compute the actual number of bucket entries to pack them in power
 * of two sizes for more efficient space utilization.
 *
 * BUCKET_SIZE(n) is the number of pointer slots left over once the
 * struct uma_bucket header has been subtracted from a budget of n
 * pointer-sized words.  BUCKET_MAX is that count for the largest bucket
 * (a 256-word budget).
 */
#define BUCKET_SIZE(n)                                          \
    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))

#define BUCKET_MAX      BUCKET_SIZE(256)
 212
/*
 * Table of bucket zones, one per supported bucket capacity.
 *
 * The table ends with a sentinel whose ubz_entries is 0; every loop over the
 * table stops there.  bucket_zone_lookup() and bucket_select() scan from the
 * front and depend on the ordering used here: ubz_entries grows from one
 * entry to the next while ubz_maxsize shrinks.  The ubz_zone members start
 * out NULL and are filled in by bucket_init().
 */
struct uma_bucket_zone bucket_zones[] = {
        { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
        { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
        { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
        { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
        { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
        { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
        { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
        { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
        { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
        { NULL, NULL, 0}        /* Sentinel: ubz_entries == 0 ends the table. */
};
 225
/*
 * Flags and enumerations to be passed to internal functions.
 *
 * enum zfreeskip is the type of the last parameter of zone_free_item().
 * NOTE(review): presumably SKIP_DTOR suppresses the item destructor and
 * SKIP_FINI additionally suppresses fini -- confirm in zone_free_item().
 */
enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
 230
 231 /* Prototypes.. */
 232
 233 static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
 234 static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
 235 static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
 236 static void page_free(void *, vm_size_t, uint8_t);
 237 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 238 static void cache_drain(uma_zone_t);
 239 static void bucket_drain(uma_zone_t, uma_bucket_t);
 240 static void bucket_cache_drain(uma_zone_t zone);
 241 static int keg_ctor(void *, int, void *, int);
 242 static void keg_dtor(void *, int, void *);
 243 static int zone_ctor(void *, int, void *, int);
 244 static void zone_dtor(void *, int, void *);
 245 static int zero_init(void *, int, int);
 246 static void keg_small_init(uma_keg_t keg);
 247 static void keg_large_init(uma_keg_t keg);
 248 static void zone_foreach(void (*zfunc)(uma_zone_t));
 249 static void zone_timeout(uma_zone_t zone);
 250 static int hash_alloc(struct uma_hash *);
 251 static int hash_expand(struct uma_hash *, struct uma_hash *);
 252 static void hash_free(struct uma_hash *hash);
 253 static void uma_timeout(void *);
 254 static void uma_startup3(void);
 255 static void *zone_alloc_item(uma_zone_t, void *, int);
 256 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 257 static void bucket_enable(void);
 258 static void bucket_init(void);
 259 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 260 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 261 static void bucket_zone_drain(void);
 262 static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
 263 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 264 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
 265 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 266 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 267 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
 268     uma_fini fini, int align, uint32_t flags);
 269 static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
 270 static void zone_release(uma_zone_t zone, void **bucket, int cnt);
 271 static void uma_zero_item(void *item, uma_zone_t zone);
 272
 273 void uma_print_zone(uma_zone_t);
 274 void uma_print_stats(void);
 275 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 276 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 277
 278 #ifdef INVARIANTS
 279 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 280 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 281 #endif
 282
 283 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 284
 285 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
 286     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 287
 288 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
 289     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 290
 291 static int zone_warnings = 1;
 292 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
 293     "Warn when UMA zones becomes full");
 294
 295 /*
 296  * This routine checks to see whether or not it's safe to enable buckets.
 297  */
 298 static void
 299 bucket_enable(void)
 300 {
 301         bucketdisable = vm_page_count_min();
 302 }
 303
 304 /*
 305  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 306  *
 307  * For each zone, calculate the memory required for each bucket, consisting
 308  * of the header and an array of pointers.
 309  */
 310 static void
 311 bucket_init(void)
 312 {
 313         struct uma_bucket_zone *ubz;
 314         int size;
 315
 316         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 317                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 318                 size += sizeof(void *) * ubz->ubz_entries;
 319                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 320                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 321                     UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
 322         }
 323 }
 324
 325 /*
 326  * Given a desired number of entries for a bucket, return the zone from which
 327  * to allocate the bucket.
 328  */
 329 static struct uma_bucket_zone *
 330 bucket_zone_lookup(int entries)
 331 {
 332         struct uma_bucket_zone *ubz;
 333
 334         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 335                 if (ubz->ubz_entries >= entries)
 336                         return (ubz);
 337         ubz--;
 338         return (ubz);
 339 }
 340
 341 static int
 342 bucket_select(int size)
 343 {
 344         struct uma_bucket_zone *ubz;
 345
 346         ubz = &bucket_zones[0];
 347         if (size > ubz->ubz_maxsize)
 348                 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
 349
 350         for (; ubz->ubz_entries != 0; ubz++)
 351                 if (ubz->ubz_maxsize < size)
 352                         break;
 353         ubz--;
 354         return (ubz->ubz_entries);
 355 }
 356
 357 static uma_bucket_t
 358 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 359 {
 360         struct uma_bucket_zone *ubz;
 361         uma_bucket_t bucket;
 362
 363         /*
 364          * This is to stop us from allocating per cpu buckets while we're
 365          * running out of vm.boot_pages.  Otherwise, we would exhaust the
 366          * boot pages.  This also prevents us from allocating buckets in
 367          * low memory situations.
 368          */
 369         if (bucketdisable)
 370                 return (NULL);
 371         /*
 372          * To limit bucket recursion we store the original zone flags
 373          * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 374          * NOVM flag to persist even through deep recursions.  We also
 375          * store ZFLAG_BUCKET once we have recursed attempting to allocate
 376          * a bucket for a bucket zone so we do not allow infinite bucket
 377          * recursion.  This cookie will even persist to frees of unused
 378          * buckets via the allocation path or bucket allocations in the
 379          * free path.
 380          */
 381         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 382                 udata = (void *)(uintptr_t)zone->uz_flags;
 383         else {
 384                 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 385                         return (NULL);
 386                 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 387         }
 388         if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
 389                 flags |= M_NOVM;
 390         ubz = bucket_zone_lookup(zone->uz_count);
 391         if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 392                 ubz++;
 393         bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 394         if (bucket) {
 395 #ifdef INVARIANTS
 396                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 397 #endif
 398                 bucket->ub_cnt = 0;
 399                 bucket->ub_entries = ubz->ubz_entries;
 400         }
 401
 402         return (bucket);
 403 }
 404
 405 static void
 406 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 407 {
 408         struct uma_bucket_zone *ubz;
 409
 410         KASSERT(bucket->ub_cnt == 0,
 411             ("bucket_free: Freeing a non free bucket."));
 412         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 413                 udata = (void *)(uintptr_t)zone->uz_flags;
 414         ubz = bucket_zone_lookup(bucket->ub_entries);
 415         uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 416 }
 417
 418 static void
 419 bucket_zone_drain(void)
 420 {
 421         struct uma_bucket_zone *ubz;
 422
 423         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 424                 zone_drain(ubz->ubz_zone);
 425 }
 426
 427 static void
 428 zone_log_warning(uma_zone_t zone)
 429 {
 430         static const struct timeval warninterval = { 300, 0 };
 431
 432         if (!zone_warnings || zone->uz_warning == NULL)
 433                 return;
 434
 435         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 436                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 437 }
 438
 439 static inline void
 440 zone_maxaction(uma_zone_t zone)
 441 {
 442         if (zone->uz_maxaction)
 443                 (*zone->uz_maxaction)(zone);
 444 }
 445
 446 static void
 447 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 448 {
 449         uma_klink_t klink;
 450
 451         LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
 452                 kegfn(klink->kl_keg);
 453 }
 454
 455 /*
 456  * Routine called by timeout which is used to fire off some time interval
 457  * based calculations.  (stats, hash size, etc.)
 458  *
 459  * Arguments:
 460  *      arg   Unused
 461  *
 462  * Returns:
 463  *      Nothing
 464  */
 465 static void
 466 uma_timeout(void *unused)
 467 {
 468         bucket_enable();
 469         zone_foreach(zone_timeout);
 470
 471         /* Reschedule this event */
 472         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 473 }
 474
 475 /*
 476  * Routine to perform timeout driven calculations.  This expands the
 477  * hashes and does per cpu statistics aggregation.
 478  *
 479  *  Returns nothing.
 480  */
 481 static void
 482 keg_timeout(uma_keg_t keg)
 483 {
 484
 485         KEG_LOCK(keg);
 486         /*
 487          * Expand the keg hash table.
 488          *
 489          * This is done if the number of slabs is larger than the hash size.
 490          * What I'm trying to do here is completely reduce collisions.  This
 491          * may be a little aggressive.  Should I allow for two collisions max?
 492          */
 493         if (keg->uk_flags & UMA_ZONE_HASH &&
 494             keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 495                 struct uma_hash newhash;
 496                 struct uma_hash oldhash;
 497                 int ret;
 498
 499                 /*
 500                  * This is so involved because allocating and freeing
 501                  * while the keg lock is held will lead to deadlock.
 502                  * I have to do everything in stages and check for
 503                  * races.
 504                  */
 505                 newhash = keg->uk_hash;
 506                 KEG_UNLOCK(keg);
 507                 ret = hash_alloc(&newhash);
 508                 KEG_LOCK(keg);
 509                 if (ret) {
 510                         if (hash_expand(&keg->uk_hash, &newhash)) {
 511                                 oldhash = keg->uk_hash;
 512                                 keg->uk_hash = newhash;
 513                         } else
 514                                 oldhash = newhash;
 515
 516                         KEG_UNLOCK(keg);
 517                         hash_free(&oldhash);
 518                         return;
 519                 }
 520         }
 521         KEG_UNLOCK(keg);
 522 }
 523
 524 static void
 525 zone_timeout(uma_zone_t zone)
 526 {
 527
 528         zone_foreach_keg(zone, &keg_timeout);
 529 }
 530
 531 /*
 532  * Allocate and zero fill the next sized hash table from the appropriate
 533  * backing store.
 534  *
 535  * Arguments:
 536  *      hash  A new hash structure with the old hash size in uh_hashsize
 537  *
 538  * Returns:
 539  *      1 on sucess and 0 on failure.
 540  */
 541 static int
 542 hash_alloc(struct uma_hash *hash)
 543 {
 544         int oldsize;
 545         int alloc;
 546
 547         oldsize = hash->uh_hashsize;
 548
 549         /* We're just going to go to a power of two greater */
 550         if (oldsize)  {
 551                 hash->uh_hashsize = oldsize * 2;
 552                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 553                 hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 554                     M_UMAHASH, M_NOWAIT);
 555         } else {
 556                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 557                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
 558                     M_WAITOK);
 559                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 560         }
 561         if (hash->uh_slab_hash) {
 562                 bzero(hash->uh_slab_hash, alloc);
 563                 hash->uh_hashmask = hash->uh_hashsize - 1;
 564                 return (1);
 565         }
 566
 567         return (0);
 568 }
 569
 570 /*
 571  * Expands the hash table for HASH zones.  This is done from zone_timeout
 572  * to reduce collisions.  This must not be done in the regular allocation
 573  * path, otherwise, we can recurse on the vm while allocating pages.
 574  *
 575  * Arguments:
 576  *      oldhash  The hash you want to expand
 577  *      newhash  The hash structure for the new table
 578  *
 579  * Returns:
 580  *      Nothing
 581  *
 582  * Discussion:
 583  */
 584 static int
 585 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 586 {
 587         uma_slab_t slab;
 588         int hval;
 589         int i;
 590
 591         if (!newhash->uh_slab_hash)
 592                 return (0);
 593
 594         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
 595                 return (0);
 596
 597         /*
 598          * I need to investigate hash algorithms for resizing without a
 599          * full rehash.
 600          */
 601
 602         for (i = 0; i < oldhash->uh_hashsize; i++)
 603                 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
 604                         slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
 605                         SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
 606                         hval = UMA_HASH(newhash, slab->us_data);
 607                         SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 608                             slab, us_hlink);
 609                 }
 610
 611         return (1);
 612 }
 613
 614 /*
 615  * Free the hash bucket to the appropriate backing store.
 616  *
 617  * Arguments:
 618  *      slab_hash  The hash bucket we're freeing
 619  *      hashsize   The number of entries in that hash bucket
 620  *
 621  * Returns:
 622  *      Nothing
 623  */
 624 static void
 625 hash_free(struct uma_hash *hash)
 626 {
 627         if (hash->uh_slab_hash == NULL)
 628                 return;
 629         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
 630                 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 631         else
 632                 free(hash->uh_slab_hash, M_UMAHASH);
 633 }
 634
 635 /*
 636  * Frees all outstanding items in a bucket
 637  *
 638  * Arguments:
 639  *      zone   The zone to free to, must be unlocked.
 640  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 641  *
 642  * Returns:
 643  *      Nothing
 644  */
 645
 646 static void
 647 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 648 {
 649         int i;
 650
 651         if (bucket == NULL)
 652                 return;
 653
 654         if (zone->uz_fini)
 655                 for (i = 0; i < bucket->ub_cnt; i++)
 656                         zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 657         zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
 658         bucket->ub_cnt = 0;
 659 }
 660
 661 /*
 662  * Drains the per cpu caches for a zone.
 663  *
 664  * NOTE: This may only be called while the zone is being turn down, and not
 665  * during normal operation.  This is necessary in order that we do not have
 666  * to migrate CPUs to drain the per-CPU caches.
 667  *
 668  * Arguments:
 669  *      zone     The zone to drain, must be unlocked.
 670  *
 671  * Returns:
 672  *      Nothing
 673  */
 674 static void
 675 cache_drain(uma_zone_t zone)
 676 {
 677         uma_cache_t cache;
 678         int cpu;
 679
 680         /*
 681          * XXX: It is safe to not lock the per-CPU caches, because we're
 682          * tearing down the zone anyway.  I.e., there will be no further use
 683          * of the caches at this point.
 684          *
 685          * XXX: It would good to be able to assert that the zone is being
 686          * torn down to prevent improper use of cache_drain().
 687          *
 688          * XXX: We lock the zone before passing into bucket_cache_drain() as
 689          * it is used elsewhere.  Should the tear-down path be made special
 690          * there in some form?
 691          */
 692         CPU_FOREACH(cpu) {
 693                 cache = &zone->uz_cpu[cpu];
 694                 bucket_drain(zone, cache->uc_allocbucket);
 695                 bucket_drain(zone, cache->uc_freebucket);
 696                 if (cache->uc_allocbucket != NULL)
 697                         bucket_free(zone, cache->uc_allocbucket, NULL);
 698                 if (cache->uc_freebucket != NULL)
 699                         bucket_free(zone, cache->uc_freebucket, NULL);
 700                 cache->uc_allocbucket = cache->uc_freebucket = NULL;
 701         }
 702         ZONE_LOCK(zone);
 703         bucket_cache_drain(zone);
 704         ZONE_UNLOCK(zone);
 705 }
 706
 707 static void
 708 cache_shrink(uma_zone_t zone)
 709 {
 710
 711         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 712                 return;
 713
 714         ZONE_LOCK(zone);
 715         zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
 716         ZONE_UNLOCK(zone);
 717 }
 718
 719 static void
 720 cache_drain_safe_cpu(uma_zone_t zone)
 721 {
 722         uma_cache_t cache;
 723         uma_bucket_t b1, b2;
 724
 725         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 726                 return;
 727
 728         b1 = b2 = NULL;
 729         ZONE_LOCK(zone);
 730         critical_enter();
 731         cache = &zone->uz_cpu[curcpu];
 732         if (cache->uc_allocbucket) {
 733                 if (cache->uc_allocbucket->ub_cnt != 0)
 734                         LIST_INSERT_HEAD(&zone->uz_buckets,
 735                             cache->uc_allocbucket, ub_link);
 736                 else
 737                         b1 = cache->uc_allocbucket;
 738                 cache->uc_allocbucket = NULL;
 739         }
 740         if (cache->uc_freebucket) {
 741                 if (cache->uc_freebucket->ub_cnt != 0)
 742                         LIST_INSERT_HEAD(&zone->uz_buckets,
 743                             cache->uc_freebucket, ub_link);
 744                 else
 745                         b2 = cache->uc_freebucket;
 746                 cache->uc_freebucket = NULL;
 747         }
 748         critical_exit();
 749         ZONE_UNLOCK(zone);
 750         if (b1)
 751                 bucket_free(zone, b1, NULL);
 752         if (b2)
 753                 bucket_free(zone, b2, NULL);
 754 }
 755
 756 /*
 757  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
 758  * This is an expensive call because it needs to bind to all CPUs
 759  * one by one and enter a critical section on each of them in order
 760  * to safely access their cache buckets.
 761  * Zone lock must not be held on call this function.
 762  */
 763 static void
 764 cache_drain_safe(uma_zone_t zone)
 765 {
 766         int cpu;
 767
 768         /*
 769          * Polite bucket sizes shrinking was not enouth, shrink aggressively.
 770          */
 771         if (zone)
 772                 cache_shrink(zone);
 773         else
 774                 zone_foreach(cache_shrink);
 775
 776         CPU_FOREACH(cpu) {
 777                 thread_lock(curthread);
 778                 sched_bind(curthread, cpu);
 779                 thread_unlock(curthread);
 780
 781                 if (zone)
 782                         cache_drain_safe_cpu(zone);
 783                 else
 784                         zone_foreach(cache_drain_safe_cpu);
 785         }
 786         thread_lock(curthread);
 787         sched_unbind(curthread);
 788         thread_unlock(curthread);
 789 }
 790
 791 /*
 792  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
 793  */
 794 static void
 795 bucket_cache_drain(uma_zone_t zone)
 796 {
 797         uma_bucket_t bucket;
 798
 799         /*
 800          * Drain the bucket queues and free the buckets, we just keep two per
 801          * cpu (alloc/free).
 802          */
 803         while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
 804                 LIST_REMOVE(bucket, ub_link);
 805                 ZONE_UNLOCK(zone);
 806                 bucket_drain(zone, bucket);
 807                 bucket_free(zone, bucket, NULL);
 808                 ZONE_LOCK(zone);
 809         }
 810
 811         /*
 812          * Shrink further bucket sizes.  Price of single zone lock collision
 813          * is probably lower then price of global cache drain.
 814          */
 815         if (zone->uz_count > zone->uz_count_min)
 816                 zone->uz_count--;
 817 }
 818
 819 static void
 820 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
 821 {
 822         uint8_t *mem;
 823         int i;
 824         uint8_t flags;
 825
 826         mem = slab->us_data;
 827         flags = slab->us_flags;
 828         i = start;
 829         if (keg->uk_fini != NULL) {
 830                 for (i--; i > -1; i--)
 831                         keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
 832                             keg->uk_size);
 833         }
 834         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 835                 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 836 #ifdef UMA_DEBUG
 837         printf("%s: Returning %d bytes.\n", keg->uk_name,
 838             PAGE_SIZE * keg->uk_ppera);
 839 #endif
 840         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 841 }
 842
 843 /*
 844  * Frees pages from a keg back to the system.  This is done on demand from
 845  * the pageout daemon.
 846  *
 847  * Returns nothing.
 848  */
 849 static void
 850 keg_drain(uma_keg_t keg)
 851 {
 852         struct slabhead freeslabs = { 0 };
 853         uma_slab_t slab;
 854         uma_slab_t n;
 855
 856         /*
 857          * We don't want to take pages from statically allocated kegs at this
 858          * time
 859          */
 860         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
 861                 return;
 862
 863 #ifdef UMA_DEBUG
 864         printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
 865 #endif
 866         KEG_LOCK(keg);
 867         if (keg->uk_free == 0)
 868                 goto finished;
 869
 870         slab = LIST_FIRST(&keg->uk_free_slab);
 871         while (slab) {
 872                 n = LIST_NEXT(slab, us_link);
 873
 874                 /* We have no where to free these to */
 875                 if (slab->us_flags & UMA_SLAB_BOOT) {
 876                         slab = n;
 877                         continue;
 878                 }
 879
 880                 LIST_REMOVE(slab, us_link);
 881                 keg->uk_pages -= keg->uk_ppera;
 882                 keg->uk_free -= keg->uk_ipers;
 883
 884                 if (keg->uk_flags & UMA_ZONE_HASH)
 885                         UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 886
 887                 SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
 888
 889                 slab = n;
 890         }
 891 finished:
 892         KEG_UNLOCK(keg);
 893
 894         while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 895                 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
 896                 keg_free_slab(keg, slab, keg->uk_ipers);
 897         }
 898 }
 899
 900 static void
 901 zone_drain_wait(uma_zone_t zone, int waitok)
 902 {
 903
 904         /*
 905          * Set draining to interlock with zone_dtor() so we can release our
 906          * locks as we go.  Only dtor() should do a WAITOK call since it
 907          * is the only call that knows the structure will still be available
 908          * when it wakes up.
 909          */
 910         ZONE_LOCK(zone);
 911         while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 912                 if (waitok == M_NOWAIT)
 913                         goto out;
 914                 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 915         }
 916         zone->uz_flags |= UMA_ZFLAG_DRAINING;
 917         bucket_cache_drain(zone);
 918         ZONE_UNLOCK(zone);
 919         /*
 920          * The DRAINING flag protects us from being freed while
 921          * we're running.  Normally the uma_rwlock would protect us but we
 922          * must be able to release and acquire the right lock for each keg.
 923          */
 924         zone_foreach_keg(zone, &keg_drain);
 925         ZONE_LOCK(zone);
 926         zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
 927         wakeup(zone);
 928 out:
 929         ZONE_UNLOCK(zone);
 930 }
 931
 932 void
 933 zone_drain(uma_zone_t zone)
 934 {
 935
 936         zone_drain_wait(zone, M_NOWAIT);
 937 }
 938
 939 /*
 940  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
 941  *
 942  * Arguments:
 943  *      wait  Shall we wait?
 944  *
 945  * Returns:
 946  *      The slab that was allocated or NULL if there is no memory and the
 947  *      caller specified M_NOWAIT.
 948  */
 949 static uma_slab_t
 950 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 951 {
 952         uma_slabrefcnt_t slabref;
 953         uma_alloc allocf;
 954         uma_slab_t slab;
 955         uint8_t *mem;
 956         uint8_t flags;
 957         int i;
 958
 959         mtx_assert(&keg->uk_lock, MA_OWNED);
 960         slab = NULL;
 961         mem = NULL;
 962
 963 #ifdef UMA_DEBUG
 964         printf("alloc_slab:  Allocating a new slab for %s\n", keg->uk_name);
 965 #endif
 966         allocf = keg->uk_allocf;
 967         KEG_UNLOCK(keg);
 968
 969         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 970                 slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
 971                 if (slab == NULL)
 972                         goto out;
 973         }
 974
 975         /*
 976          * This reproduces the old vm_zone behavior of zero filling pages the
 977          * first time they are added to a zone.
 978          *
 979          * Malloced items are zeroed in uma_zalloc.
 980          */
 981
 982         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
 983                 wait |= M_ZERO;
 984         else
 985                 wait &= ~M_ZERO;
 986
 987         if (keg->uk_flags & UMA_ZONE_NODUMP)
 988                 wait |= M_NODUMP;
 989
 990         /* zone is passed for legacy reasons. */
 991         mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
 992         if (mem == NULL) {
 993                 if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 994                         zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
 995                 slab = NULL;
 996                 goto out;
 997         }
 998
 999         /* Point the slab into the allocated memory */
1000         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1001                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
1002
1003         if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1004                 for (i = 0; i < keg->uk_ppera; i++)
1005                         vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1006
1007         slab->us_keg = keg;
1008         slab->us_data = mem;
1009         slab->us_freecount = keg->uk_ipers;
1010         slab->us_flags = flags;
1011         BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1012 #ifdef INVARIANTS
1013         BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1014 #endif
1015         if (keg->uk_flags & UMA_ZONE_REFCNT) {
1016                 slabref = (uma_slabrefcnt_t)slab;
1017                 for (i = 0; i < keg->uk_ipers; i++)
1018                         slabref->us_refcnt[i] = 0;
1019         }
1020
1021         if (keg->uk_init != NULL) {
1022                 for (i = 0; i < keg->uk_ipers; i++)
1023                         if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1024                             keg->uk_size, wait) != 0)
1025                                 break;
1026                 if (i != keg->uk_ipers) {
1027                         keg_free_slab(keg, slab, i);
1028                         slab = NULL;
1029                         goto out;
1030                 }
1031         }
1032 out:
1033         KEG_LOCK(keg);
1034
1035         if (slab != NULL) {
1036                 if (keg->uk_flags & UMA_ZONE_HASH)
1037                         UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1038
1039                 keg->uk_pages += keg->uk_ppera;
1040                 keg->uk_free += keg->uk_ipers;
1041         }
1042
1043         return (slab);
1044 }
1045
1046 /*
1047  * This function is intended to be used early on in place of page_alloc() so
1048  * that we may use the boot time page cache to satisfy allocations before
1049  * the VM is ready.
1050  */
1051 static void *
1052 startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
1053 {
1054         uma_keg_t keg;
1055         uma_slab_t tmps;
1056         int pages, check_pages;
1057
1058         keg = zone_first_keg(zone);
1059         pages = howmany(bytes, PAGE_SIZE);
1060         check_pages = pages - 1;
1061         KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
1062
1063         /*
1064          * Check our small startup cache to see if it has pages remaining.
1065          */
1066         mtx_lock(&uma_boot_pages_mtx);
1067
1068         /* First check if we have enough room. */
1069         tmps = LIST_FIRST(&uma_boot_pages);
1070         while (tmps != NULL && check_pages-- > 0)
1071                 tmps = LIST_NEXT(tmps, us_link);
1072         if (tmps != NULL) {
1073                 /*
1074                  * It's ok to lose tmps references.  The last one will
1075                  * have tmps->us_data pointing to the start address of
1076                  * "pages" contiguous pages of memory.
1077                  */
1078                 while (pages-- > 0) {
1079                         tmps = LIST_FIRST(&uma_boot_pages);
1080                         LIST_REMOVE(tmps, us_link);
1081                 }
1082                 mtx_unlock(&uma_boot_pages_mtx);
1083                 *pflag = tmps->us_flags;
1084                 return (tmps->us_data);
1085         }
1086         mtx_unlock(&uma_boot_pages_mtx);
1087         if (booted < UMA_STARTUP2)
1088                 panic("UMA: Increase vm.boot_pages");
1089         /*
1090          * Now that we've booted reset these users to their real allocator.
1091          */
1092 #ifdef UMA_MD_SMALL_ALLOC
1093         keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
1094 #else
1095         keg->uk_allocf = page_alloc;
1096 #endif
1097         return keg->uk_allocf(zone, bytes, pflag, wait);
1098 }
1099
1100 /*
1101  * Allocates a number of pages from the system
1102  *
1103  * Arguments:
1104  *      bytes  The number of bytes requested
1105  *      wait  Shall we wait?
1106  *
1107  * Returns:
1108  *      A pointer to the alloced memory or possibly
1109  *      NULL if M_NOWAIT is set.
1110  */
1111 static void *
1112 page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
1113 {
1114         void *p;        /* Returned page */
1115
1116         *pflag = UMA_SLAB_KMEM;
1117         p = (void *) kmem_malloc(kmem_arena, bytes, wait);
1118
1119         return (p);
1120 }
1121
1122 /*
1123  * Allocates a number of pages from within an object
1124  *
1125  * Arguments:
1126  *      bytes  The number of bytes requested
1127  *      wait   Shall we wait?
1128  *
1129  * Returns:
1130  *      A pointer to the alloced memory or possibly
1131  *      NULL if M_NOWAIT is set.
1132  */
1133 static void *
1134 noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
1135 {
1136         TAILQ_HEAD(, vm_page) alloctail;
1137         u_long npages;
1138         vm_offset_t retkva, zkva;
1139         vm_page_t p, p_next;
1140         uma_keg_t keg;
1141
1142         TAILQ_INIT(&alloctail);
1143         keg = zone_first_keg(zone);
1144
1145         npages = howmany(bytes, PAGE_SIZE);
1146         while (npages > 0) {
1147                 p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1148                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1149                 if (p != NULL) {
1150                         /*
1151                          * Since the page does not belong to an object, its
1152                          * listq is unused.
1153                          */
1154                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1155                         npages--;
1156                         continue;
1157                 }
1158                 if (wait & M_WAITOK) {
1159                         VM_WAIT;
1160                         continue;
1161                 }
1162
1163                 /*
1164                  * Page allocation failed, free intermediate pages and
1165                  * exit.
1166                  */
1167                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1168                         vm_page_unwire(p, PQ_NONE);
1169                         vm_page_free(p);
1170                 }
1171                 return (NULL);
1172         }
1173         *flags = UMA_SLAB_PRIV;
1174         zkva = keg->uk_kva +
1175             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1176         retkva = zkva;
1177         TAILQ_FOREACH(p, &alloctail, listq) {
1178                 pmap_qenter(zkva, &p, 1);
1179                 zkva += PAGE_SIZE;
1180         }
1181
1182         return ((void *)retkva);
1183 }
1184
1185 /*
1186  * Frees a number of pages to the system
1187  *
1188  * Arguments:
1189  *      mem   A pointer to the memory to be freed
1190  *      size  The size of the memory being freed
1191  *      flags The original p->us_flags field
1192  *
1193  * Returns:
1194  *      Nothing
1195  */
1196 static void
1197 page_free(void *mem, vm_size_t size, uint8_t flags)
1198 {
1199         struct vmem *vmem;
1200
1201         if (flags & UMA_SLAB_KMEM)
1202                 vmem = kmem_arena;
1203         else if (flags & UMA_SLAB_KERNEL)
1204                 vmem = kernel_arena;
1205         else
1206                 panic("UMA: page_free used with invalid flags %d", flags);
1207
1208         kmem_free(vmem, (vm_offset_t)mem, size);
1209 }
1210
/*
 * Zero fill initializer
 *
 * Arguments/Returns follow uma_init specifications.  The flags argument is
 * ignored and the routine always reports success (0).
 */
static int
zero_init(void *mem, int size, int flags)
{

        memset(mem, 0, size);
        return (0);
}
1222
1223 /*
1224  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1225  *
1226  * Arguments
1227  *      keg  The zone we should initialize
1228  *
1229  * Returns
1230  *      Nothing
1231  */
1232 static void
1233 keg_small_init(uma_keg_t keg)
1234 {
1235         u_int rsize;
1236         u_int memused;
1237         u_int wastedspace;
1238         u_int shsize;
1239
1240         if (keg->uk_flags & UMA_ZONE_PCPU) {
1241                 u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
1242
1243                 keg->uk_slabsize = sizeof(struct pcpu);
1244                 keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
1245                     PAGE_SIZE);
1246         } else {
1247                 keg->uk_slabsize = UMA_SLAB_SIZE;
1248                 keg->uk_ppera = 1;
1249         }
1250
1251         /*
1252          * Calculate the size of each allocation (rsize) according to
1253          * alignment.  If the requested size is smaller than we have
1254          * allocation bits for we round it up.
1255          */
1256         rsize = keg->uk_size;
1257         if (rsize < keg->uk_slabsize / SLAB_SETSIZE)
1258                 rsize = keg->uk_slabsize / SLAB_SETSIZE;
1259         if (rsize & keg->uk_align)
1260                 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1261         keg->uk_rsize = rsize;
1262
1263         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1264             keg->uk_rsize < sizeof(struct pcpu),
1265             ("%s: size %u too large", __func__, keg->uk_rsize));
1266
1267         if (keg->uk_flags & UMA_ZONE_REFCNT)
1268                 rsize += sizeof(uint32_t);
1269
1270         if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1271                 shsize = 0;
1272         else
1273                 shsize = sizeof(struct uma_slab);
1274
1275         keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize;
1276         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1277             ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1278
1279         memused = keg->uk_ipers * rsize + shsize;
1280         wastedspace = keg->uk_slabsize - memused;
1281
1282         /*
1283          * We can't do OFFPAGE if we're internal or if we've been
1284          * asked to not go to the VM for buckets.  If we do this we
1285          * may end up going to the VM  for slabs which we do not
1286          * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1287          * of UMA_ZONE_VM, which clearly forbids it.
1288          */
1289         if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1290             (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1291                 return;
1292
1293         /*
1294          * See if using an OFFPAGE slab will limit our waste.  Only do
1295          * this if it permits more items per-slab.
1296          *
1297          * XXX We could try growing slabsize to limit max waste as well.
1298          * Historically this was not done because the VM could not
1299          * efficiently handle contiguous allocations.
1300          */
1301         if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) &&
1302             (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) {
1303                 keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize;
1304                 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1305                     ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1306 #ifdef UMA_DEBUG
1307                 printf("UMA decided we need offpage slab headers for "
1308                     "keg: %s, calculated wastedspace = %d, "
1309                     "maximum wasted space allowed = %d, "
1310                     "calculated ipers = %d, "
1311                     "new wasted space = %d\n", keg->uk_name, wastedspace,
1312                     keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1313                     keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize);
1314 #endif
1315                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1316         }
1317
1318         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1319             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1320                 keg->uk_flags |= UMA_ZONE_HASH;
1321 }
1322
1323 /*
1324  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1325  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1326  * more complicated.
1327  *
1328  * Arguments
1329  *      keg  The keg we should initialize
1330  *
1331  * Returns
1332  *      Nothing
1333  */
1334 static void
1335 keg_large_init(uma_keg_t keg)
1336 {
1337         u_int shsize;
1338
1339         KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1340         KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1341             ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1342         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1343             ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1344
1345         keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1346         keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE;
1347         keg->uk_ipers = 1;
1348         keg->uk_rsize = keg->uk_size;
1349
1350         /* We can't do OFFPAGE if we're internal, bail out here. */
1351         if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1352                 return;
1353
1354         /* Check whether we have enough space to not do OFFPAGE. */
1355         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
1356                 shsize = sizeof(struct uma_slab);
1357                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1358                         shsize += keg->uk_ipers * sizeof(uint32_t);
1359                 if (shsize & UMA_ALIGN_PTR)
1360                         shsize = (shsize & ~UMA_ALIGN_PTR) +
1361                             (UMA_ALIGN_PTR + 1);
1362
1363                 if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
1364                         keg->uk_flags |= UMA_ZONE_OFFPAGE;
1365         }
1366
1367         if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1368             (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1369                 keg->uk_flags |= UMA_ZONE_HASH;
1370 }
1371
1372 static void
1373 keg_cachespread_init(uma_keg_t keg)
1374 {
1375         int alignsize;
1376         int trailer;
1377         int pages;
1378         int rsize;
1379
1380         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1381             ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1382
1383         alignsize = keg->uk_align + 1;
1384         rsize = keg->uk_size;
1385         /*
1386          * We want one item to start on every align boundary in a page.  To
1387          * do this we will span pages.  We will also extend the item by the
1388          * size of align if it is an even multiple of align.  Otherwise, it
1389          * would fall on the same boundary every time.
1390          */
1391         if (rsize & keg->uk_align)
1392                 rsize = (rsize & ~keg->uk_align) + alignsize;
1393         if ((rsize & alignsize) == 0)
1394                 rsize += alignsize;
1395         trailer = rsize - keg->uk_size;
1396         pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1397         pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1398         keg->uk_rsize = rsize;
1399         keg->uk_ppera = pages;
1400         keg->uk_slabsize = UMA_SLAB_SIZE;
1401         keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1402         keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1403         KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1404             ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1405             keg->uk_ipers));
1406 }
1407
1408 /*
1409  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1410  * the keg onto the global keg list.
1411  *
1412  * Arguments/Returns follow uma_ctor specifications
1413  *      udata  Actually uma_kctor_args
1414  */
1415 static int
1416 keg_ctor(void *mem, int size, void *udata, int flags)
1417 {
1418         struct uma_kctor_args *arg = udata;
1419         uma_keg_t keg = mem;
1420         uma_zone_t zone;
1421
1422         bzero(keg, size);
1423         keg->uk_size = arg->size;
1424         keg->uk_init = arg->uminit;
1425         keg->uk_fini = arg->fini;
1426         keg->uk_align = arg->align;
1427         keg->uk_free = 0;
1428         keg->uk_reserve = 0;
1429         keg->uk_pages = 0;
1430         keg->uk_flags = arg->flags;
1431         keg->uk_allocf = page_alloc;
1432         keg->uk_freef = page_free;
1433         keg->uk_slabzone = NULL;
1434
1435         /*
1436          * The master zone is passed to us at keg-creation time.
1437          */
1438         zone = arg->zone;
1439         keg->uk_name = zone->uz_name;
1440
1441         if (arg->flags & UMA_ZONE_VM)
1442                 keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1443
1444         if (arg->flags & UMA_ZONE_ZINIT)
1445                 keg->uk_init = zero_init;
1446
1447         if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1448                 keg->uk_flags |= UMA_ZONE_VTOSLAB;
1449
1450         if (arg->flags & UMA_ZONE_PCPU)
1451 #ifdef SMP
1452                 keg->uk_flags |= UMA_ZONE_OFFPAGE;
1453 #else
1454                 keg->uk_flags &= ~UMA_ZONE_PCPU;
1455 #endif
1456
1457         if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1458                 keg_cachespread_init(keg);
1459         } else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1460                 if (keg->uk_size >
1461                     (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
1462                     sizeof(uint32_t)))
1463                         keg_large_init(keg);
1464                 else
1465                         keg_small_init(keg);
1466         } else {
1467                 if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1468                         keg_large_init(keg);
1469                 else
1470                         keg_small_init(keg);
1471         }
1472
1473         if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1474                 if (keg->uk_flags & UMA_ZONE_REFCNT) {
1475                         if (keg->uk_ipers > uma_max_ipers_ref)
1476                                 panic("Too many ref items per zone: %d > %d\n",
1477                                     keg->uk_ipers, uma_max_ipers_ref);
1478                         keg->uk_slabzone = slabrefzone;
1479                 } else
1480                         keg->uk_slabzone = slabzone;
1481         }
1482
1483         /*
1484          * If we haven't booted yet we need allocations to go through the
1485          * startup cache until the vm is ready.
1486          */
1487         if (keg->uk_ppera == 1) {
1488 #ifdef UMA_MD_SMALL_ALLOC
1489                 keg->uk_allocf = uma_small_alloc;
1490                 keg->uk_freef = uma_small_free;
1491
1492                 if (booted < UMA_STARTUP)
1493                         keg->uk_allocf = startup_alloc;
1494 #else
1495                 if (booted < UMA_STARTUP2)
1496                         keg->uk_allocf = startup_alloc;
1497 #endif
1498         } else if (booted < UMA_STARTUP2 &&
1499             (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1500                 keg->uk_allocf = startup_alloc;
1501
1502         /*
1503          * Initialize keg's lock
1504          */
1505         KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1506
1507         /*
1508          * If we're putting the slab header in the actual page we need to
1509          * figure out where in each page it goes.  This calculates a right
1510          * justified offset into the memory on an ALIGN_PTR boundary.
1511          */
1512         if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1513                 u_int totsize;
1514
1515                 /* Size of the slab struct and free list */
1516                 totsize = sizeof(struct uma_slab);
1517
1518                 /* Size of the reference counts. */
1519                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1520                         totsize += keg->uk_ipers * sizeof(uint32_t);
1521
1522                 if (totsize & UMA_ALIGN_PTR)
1523                         totsize = (totsize & ~UMA_ALIGN_PTR) +
1524                             (UMA_ALIGN_PTR + 1);
1525                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1526
1527                 /*
1528                  * The only way the following is possible is if with our
1529                  * UMA_ALIGN_PTR adjustments we are now bigger than
1530                  * UMA_SLAB_SIZE.  I haven't checked whether this is
1531                  * mathematically possible for all cases, so we make
1532                  * sure here anyway.
1533                  */
1534                 totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1535                 if (keg->uk_flags & UMA_ZONE_REFCNT)
1536                         totsize += keg->uk_ipers * sizeof(uint32_t);
1537                 if (totsize > PAGE_SIZE * keg->uk_ppera) {
1538                         printf("zone %s ipers %d rsize %d size %d\n",
1539                             zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1540                             keg->uk_size);
1541                         panic("UMA slab won't fit.");
1542                 }
1543         }
1544
1545         if (keg->uk_flags & UMA_ZONE_HASH)
1546                 hash_alloc(&keg->uk_hash);
1547
1548 #ifdef UMA_DEBUG
1549         printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1550             zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1551             keg->uk_ipers, keg->uk_ppera,
1552             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1553 #endif
1554
1555         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1556
1557         rw_wlock(&uma_rwlock);
1558         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1559         rw_wunlock(&uma_rwlock);
1560         return (0);
1561 }
1562
1563 /*
1564  * Zone header ctor.  This initializes all fields, locks, etc.
1565  *
1566  * Arguments/Returns follow uma_ctor specifications
1567  *      udata  Actually uma_zctor_args
1568  */
1569 static int
1570 zone_ctor(void *mem, int size, void *udata, int flags)
1571 {
1572         struct uma_zctor_args *arg = udata;
1573         uma_zone_t zone = mem;
1574         uma_zone_t z;
1575         uma_keg_t keg;
1576
1577         bzero(zone, size);
1578         zone->uz_name = arg->name;
1579         zone->uz_ctor = arg->ctor;
1580         zone->uz_dtor = arg->dtor;
1581         zone->uz_slab = zone_fetch_slab;
1582         zone->uz_init = NULL;
1583         zone->uz_fini = NULL;
1584         zone->uz_allocs = 0;
1585         zone->uz_frees = 0;
1586         zone->uz_fails = 0;
1587         zone->uz_sleeps = 0;
1588         zone->uz_count = 0;
1589         zone->uz_count_min = 0;
1590         zone->uz_flags = 0;
1591         zone->uz_warning = NULL;
1592         timevalclear(&zone->uz_ratecheck);
1593         zone->uz_maxaction = NULL;
1594         keg = arg->keg;
1595
1596         ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1597
1598         /*
1599          * This is a pure cache zone, no kegs.
1600          */
1601         if (arg->import) {
1602                 if (arg->flags & UMA_ZONE_VM)
1603                         arg->flags |= UMA_ZFLAG_CACHEONLY;
1604                 zone->uz_flags = arg->flags;
1605                 zone->uz_size = arg->size;
1606                 zone->uz_import = arg->import;
1607                 zone->uz_release = arg->release;
1608                 zone->uz_arg = arg->arg;
1609                 zone->uz_lockptr = &zone->uz_lock;
1610                 rw_wlock(&uma_rwlock);
1611                 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1612                 rw_wunlock(&uma_rwlock);
1613                 goto out;
1614         }
1615
1616         /*
1617          * Use the regular zone/keg/slab allocator.
1618          */
1619         zone->uz_import = (uma_import)zone_import;
1620         zone->uz_release = (uma_release)zone_release;
1621         zone->uz_arg = zone;
1622
1623         if (arg->flags & UMA_ZONE_SECONDARY) {
1624                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1625                 zone->uz_init = arg->uminit;
1626                 zone->uz_fini = arg->fini;
1627                 zone->uz_lockptr = &keg->uk_lock;
1628                 zone->uz_flags |= UMA_ZONE_SECONDARY;
1629                 rw_wlock(&uma_rwlock);
1630                 ZONE_LOCK(zone);
1631                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1632                         if (LIST_NEXT(z, uz_link) == NULL) {
1633                                 LIST_INSERT_AFTER(z, zone, uz_link);
1634                                 break;
1635                         }
1636                 }
1637                 ZONE_UNLOCK(zone);
1638                 rw_wunlock(&uma_rwlock);
1639         } else if (keg == NULL) {
1640                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1641                     arg->align, arg->flags)) == NULL)
1642                         return (ENOMEM);
1643         } else {
1644                 struct uma_kctor_args karg;
1645                 int error;
1646
1647                 /* We should only be here from uma_startup() */
1648                 karg.size = arg->size;
1649                 karg.uminit = arg->uminit;
1650                 karg.fini = arg->fini;
1651                 karg.align = arg->align;
1652                 karg.flags = arg->flags;
1653                 karg.zone = zone;
1654                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1655                     flags);
1656                 if (error)
1657                         return (error);
1658         }
1659
1660         /*
1661          * Link in the first keg.
1662          */
1663         zone->uz_klink.kl_keg = keg;
1664         LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1665         zone->uz_lockptr = &keg->uk_lock;
1666         zone->uz_size = keg->uk_size;
1667         zone->uz_flags |= (keg->uk_flags &
1668             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1669
1670         /*
1671          * Some internal zones don't have room allocated for the per cpu
1672          * caches.  If we're internal, bail out here.
1673          */
1674         if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1675                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1676                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1677                 return (0);
1678         }
1679
1680 out:
1681         if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
1682                 zone->uz_count = bucket_select(zone->uz_size);
1683         else
1684                 zone->uz_count = BUCKET_MAX;
1685         zone->uz_count_min = zone->uz_count;
1686
1687         return (0);
1688 }
1689
1690 /*
1691  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1692  * table and removes the keg from the global list.
1693  *
1694  * Arguments/Returns follow uma_dtor specifications
1695  *      udata  unused
1696  */
1697 static void
1698 keg_dtor(void *arg, int size, void *udata)
1699 {
1700         uma_keg_t keg;
1701
1702         keg = (uma_keg_t)arg;
1703         KEG_LOCK(keg);
1704         if (keg->uk_free != 0) {
1705                 printf("Freed UMA keg (%s) was not empty (%d items). "
1706                     " Lost %d pages of memory.\n",
1707                     keg->uk_name ? keg->uk_name : "",
1708                     keg->uk_free, keg->uk_pages);
1709         }
1710         KEG_UNLOCK(keg);
1711
1712         hash_free(&keg->uk_hash);
1713
1714         KEG_LOCK_FINI(keg);
1715 }
1716
1717 /*
1718  * Zone header dtor.
1719  *
1720  * Arguments/Returns follow uma_dtor specifications
1721  *      udata  unused
1722  */
1723 static void
1724 zone_dtor(void *arg, int size, void *udata)
1725 {
1726         uma_klink_t klink;
1727         uma_zone_t zone;
1728         uma_keg_t keg;
1729
1730         zone = (uma_zone_t)arg;
1731         keg = zone_first_keg(zone);
1732
1733         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1734                 cache_drain(zone);
1735
1736         rw_wlock(&uma_rwlock);
1737         LIST_REMOVE(zone, uz_link);
1738         rw_wunlock(&uma_rwlock);
1739         /*
1740          * XXX there are some races here where
1741          * the zone can be drained but zone lock
1742          * released and then refilled before we
1743          * remove it... we dont care for now
1744          */
1745         zone_drain_wait(zone, M_WAITOK);
1746         /*
1747          * Unlink all of our kegs.
1748          */
1749         while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1750                 klink->kl_keg = NULL;
1751                 LIST_REMOVE(klink, kl_link);
1752                 if (klink == &zone->uz_klink)
1753                         continue;
1754                 free(klink, M_TEMP);
1755         }
1756         /*
1757          * We only destroy kegs from non secondary zones.
1758          */
1759         if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1760                 rw_wlock(&uma_rwlock);
1761                 LIST_REMOVE(keg, uk_link);
1762                 rw_wunlock(&uma_rwlock);
1763                 zone_free_item(kegs, keg, NULL, SKIP_NONE);
1764         }
1765         ZONE_LOCK_FINI(zone);
1766 }
1767
1768 /*
1769  * Traverses every zone in the system and calls a callback
1770  *
1771  * Arguments:
1772  *      zfunc  A pointer to a function which accepts a zone
1773  *              as an argument.
1774  *
1775  * Returns:
1776  *      Nothing
1777  */
1778 static void
1779 zone_foreach(void (*zfunc)(uma_zone_t))
1780 {
1781         uma_keg_t keg;
1782         uma_zone_t zone;
1783
1784         rw_rlock(&uma_rwlock);
1785         LIST_FOREACH(keg, &uma_kegs, uk_link) {
1786                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1787                         zfunc(zone);
1788         }
1789         rw_runlock(&uma_rwlock);
1790 }
1791
1792 /* Public functions */
1793 /* See uma.h */
1794 void
1795 uma_startup(void *bootmem, int boot_pages)
1796 {
1797         struct uma_zctor_args args;
1798         uma_slab_t slab;
1799         u_int slabsize;
1800         int i;
1801
1802 #ifdef UMA_DEBUG
1803         printf("Creating uma keg headers zone and keg.\n");
1804 #endif
1805         rw_init(&uma_rwlock, "UMA lock");
1806
1807         /* "manually" create the initial zone */
1808         memset(&args, 0, sizeof(args));
1809         args.name = "UMA Kegs";
1810         args.size = sizeof(struct uma_keg);
1811         args.ctor = keg_ctor;
1812         args.dtor = keg_dtor;
1813         args.uminit = zero_init;
1814         args.fini = NULL;
1815         args.keg = &masterkeg;
1816         args.align = 32 - 1;
1817         args.flags = UMA_ZFLAG_INTERNAL;
1818         /* The initial zone has no Per cpu queues so it's smaller */
1819         zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1820
1821 #ifdef UMA_DEBUG
1822         printf("Filling boot free list.\n");
1823 #endif
1824         for (i = 0; i < boot_pages; i++) {
1825                 slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
1826                 slab->us_data = (uint8_t *)slab;
1827                 slab->us_flags = UMA_SLAB_BOOT;
1828                 LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1829         }
1830         mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1831
1832 #ifdef UMA_DEBUG
1833         printf("Creating uma zone headers zone and keg.\n");
1834 #endif
1835         args.name = "UMA Zones";
1836         args.size = sizeof(struct uma_zone) +
1837             (sizeof(struct uma_cache) * (mp_maxid + 1));
1838         args.ctor = zone_ctor;
1839         args.dtor = zone_dtor;
1840         args.uminit = zero_init;
1841         args.fini = NULL;
1842         args.keg = NULL;
1843         args.align = 32 - 1;
1844         args.flags = UMA_ZFLAG_INTERNAL;
1845         /* The initial zone has no Per cpu queues so it's smaller */
1846         zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1847
1848 #ifdef UMA_DEBUG
1849         printf("Creating slab and hash zones.\n");
1850 #endif
1851
1852         /* Now make a zone for slab headers */
1853         slabzone = uma_zcreate("UMA Slabs",
1854                                 sizeof(struct uma_slab),
1855                                 NULL, NULL, NULL, NULL,
1856                                 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1857
1858         /*
1859          * We also create a zone for the bigger slabs with reference
1860          * counts in them, to accomodate UMA_ZONE_REFCNT zones.
1861          */
1862         slabsize = sizeof(struct uma_slab_refcnt);
1863         slabsize += uma_max_ipers_ref * sizeof(uint32_t);
1864         slabrefzone = uma_zcreate("UMA RCntSlabs",
1865                                   slabsize,
1866                                   NULL, NULL, NULL, NULL,
1867                                   UMA_ALIGN_PTR,
1868                                   UMA_ZFLAG_INTERNAL);
1869
1870         hashzone = uma_zcreate("UMA Hash",
1871             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1872             NULL, NULL, NULL, NULL,
1873             UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1874
1875         bucket_init();
1876
1877         booted = UMA_STARTUP;
1878
1879 #ifdef UMA_DEBUG
1880         printf("UMA startup complete.\n");
1881 #endif
1882 }
1883
1884 /* see uma.h */
1885 void
1886 uma_startup2(void)
1887 {
1888         booted = UMA_STARTUP2;
1889         bucket_enable();
1890         sx_init(&uma_drain_lock, "umadrain");
1891 #ifdef UMA_DEBUG
1892         printf("UMA startup2 complete.\n");
1893 #endif
1894 }
1895
1896 /*
1897  * Initialize our callout handle
1898  *
1899  */
1900
1901 static void
1902 uma_startup3(void)
1903 {
1904 #ifdef UMA_DEBUG
1905         printf("Starting callout.\n");
1906 #endif
1907         callout_init(&uma_callout, 1);
1908         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1909 #ifdef UMA_DEBUG
1910         printf("UMA startup3 complete.\n");
1911 #endif
1912 }
1913
1914 static uma_keg_t
1915 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1916                 int align, uint32_t flags)
1917 {
1918         struct uma_kctor_args args;
1919
1920         args.size = size;
1921         args.uminit = uminit;
1922         args.fini = fini;
1923         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1924         args.flags = flags;
1925         args.zone = zone;
1926         return (zone_alloc_item(kegs, &args, M_WAITOK));
1927 }
1928
1929 /* See uma.h */
1930 void
1931 uma_set_align(int align)
1932 {
1933
1934         if (align != UMA_ALIGN_CACHE)
1935                 uma_align_cache = align;
1936 }
1937
1938 /* See uma.h */
1939 uma_zone_t
1940 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1941                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
1942
1943 {
1944         struct uma_zctor_args args;
1945         uma_zone_t res;
1946         bool locked;
1947
1948         /* This stuff is essential for the zone ctor */
1949         memset(&args, 0, sizeof(args));
1950         args.name = name;
1951         args.size = size;
1952         args.ctor = ctor;
1953         args.dtor = dtor;
1954         args.uminit = uminit;
1955         args.fini = fini;
1956 #ifdef  INVARIANTS
1957         /*
1958          * If a zone is being created with an empty constructor and
1959          * destructor, pass UMA constructor/destructor which checks for
1960          * memory use after free.
1961          */
1962         if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
1963             ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
1964                 args.ctor = trash_ctor;
1965                 args.dtor = trash_dtor;
1966                 args.uminit = trash_init;
1967                 args.fini = trash_fini;
1968         }
1969 #endif
1970         args.align = align;
1971         args.flags = flags;
1972         args.keg = NULL;
1973
1974         if (booted < UMA_STARTUP2) {
1975                 locked = false;
1976         } else {
1977                 sx_slock(&uma_drain_lock);
1978                 locked = true;
1979         }
1980         res = zone_alloc_item(zones, &args, M_WAITOK);
1981         if (locked)
1982                 sx_sunlock(&uma_drain_lock);
1983         return (res);
1984 }
1985
1986 /* See uma.h */
1987 uma_zone_t
1988 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1989                     uma_init zinit, uma_fini zfini, uma_zone_t master)
1990 {
1991         struct uma_zctor_args args;
1992         uma_keg_t keg;
1993         uma_zone_t res;
1994         bool locked;
1995
1996         keg = zone_first_keg(master);
1997         memset(&args, 0, sizeof(args));
1998         args.name = name;
1999         args.size = keg->uk_size;
2000         args.ctor = ctor;
2001         args.dtor = dtor;
2002         args.uminit = zinit;
2003         args.fini = zfini;
2004         args.align = keg->uk_align;
2005         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2006         args.keg = keg;
2007
2008         if (booted < UMA_STARTUP2) {
2009                 locked = false;
2010         } else {
2011                 sx_slock(&uma_drain_lock);
2012                 locked = true;
2013         }
2014         /* XXX Attaches only one keg of potentially many. */
2015         res = zone_alloc_item(zones, &args, M_WAITOK);
2016         if (locked)
2017                 sx_sunlock(&uma_drain_lock);
2018         return (res);
2019 }
2020
2021 /* See uma.h */
2022 uma_zone_t
2023 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2024                     uma_init zinit, uma_fini zfini, uma_import zimport,
2025                     uma_release zrelease, void *arg, int flags)
2026 {
2027         struct uma_zctor_args args;
2028
2029         memset(&args, 0, sizeof(args));
2030         args.name = name;
2031         args.size = size;
2032         args.ctor = ctor;
2033         args.dtor = dtor;
2034         args.uminit = zinit;
2035         args.fini = zfini;
2036         args.import = zimport;
2037         args.release = zrelease;
2038         args.arg = arg;
2039         args.align = 0;
2040         args.flags = flags;
2041
2042         return (zone_alloc_item(zones, &args, M_WAITOK));
2043 }
2044
2045 static void
2046 zone_lock_pair(uma_zone_t a, uma_zone_t b)
2047 {
2048         if (a < b) {
2049                 ZONE_LOCK(a);
2050                 mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
2051         } else {
2052                 ZONE_LOCK(b);
2053                 mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
2054         }
2055 }
2056
2057 static void
2058 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2059 {
2060
2061         ZONE_UNLOCK(a);
2062         ZONE_UNLOCK(b);
2063 }
2064
2065 int
2066 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2067 {
2068         uma_klink_t klink;
2069         uma_klink_t kl;
2070         int error;
2071
2072         error = 0;
2073         klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2074
2075         zone_lock_pair(zone, master);
2076         /*
2077          * zone must use vtoslab() to resolve objects and must already be
2078          * a secondary.
2079          */
2080         if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2081             != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2082                 error = EINVAL;
2083                 goto out;
2084         }
2085         /*
2086          * The new master must also use vtoslab().
2087          */
2088         if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2089                 error = EINVAL;
2090                 goto out;
2091         }
2092         /*
2093          * Both must either be refcnt, or not be refcnt.
2094          */
2095         if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
2096             (master->uz_flags & UMA_ZONE_REFCNT)) {
2097                 error = EINVAL;
2098                 goto out;
2099         }
2100         /*
2101          * The underlying object must be the same size.  rsize
2102          * may be different.
2103          */
2104         if (master->uz_size != zone->uz_size) {
2105                 error = E2BIG;
2106                 goto out;
2107         }
2108         /*
2109          * Put it at the end of the list.
2110          */
2111         klink->kl_keg = zone_first_keg(master);
2112         LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2113                 if (LIST_NEXT(kl, kl_link) == NULL) {
2114                         LIST_INSERT_AFTER(kl, klink, kl_link);
2115                         break;
2116                 }
2117         }
2118         klink = NULL;
2119         zone->uz_flags |= UMA_ZFLAG_MULTI;
2120         zone->uz_slab = zone_fetch_slab_multi;
2121
2122 out:
2123         zone_unlock_pair(zone, master);
2124         if (klink != NULL)
2125                 free(klink, M_TEMP);
2126
2127         return (error);
2128 }
2129
2130
2131 /* See uma.h */
2132 void
2133 uma_zdestroy(uma_zone_t zone)
2134 {
2135
2136         sx_slock(&uma_drain_lock);
2137         zone_free_item(zones, zone, NULL, SKIP_NONE);
2138         sx_sunlock(&uma_drain_lock);
2139 }
2140
2141 /* See uma.h */
2142 void *
2143 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2144 {
2145         void *item;
2146         uma_cache_t cache;
2147         uma_bucket_t bucket;
2148         int lockfail;
2149         int cpu;
2150
2151         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2152         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2153
2154         /* This is the fast path allocation */
2155 #ifdef UMA_DEBUG_ALLOC_1
2156         printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2157 #endif
2158         CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2159             zone->uz_name, flags);
2160
2161         if (flags & M_WAITOK) {
2162                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2163                     "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2164         }
2165         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2166             ("uma_zalloc_arg: called with spinlock or critical section held"));
2167
2168 #ifdef DEBUG_MEMGUARD
2169         if (memguard_cmp_zone(zone)) {
2170                 item = memguard_alloc(zone->uz_size, flags);
2171                 if (item != NULL) {
2172                         /*
2173                          * Avoid conflict with the use-after-free
2174                          * protecting infrastructure from INVARIANTS.
2175                          */
2176                         if (zone->uz_init != NULL &&
2177                             zone->uz_init != mtrash_init &&
2178                             zone->uz_init(item, zone->uz_size, flags) != 0)
2179                                 return (NULL);
2180                         if (zone->uz_ctor != NULL &&
2181                             zone->uz_ctor != mtrash_ctor &&
2182                             zone->uz_ctor(item, zone->uz_size, udata,
2183                             flags) != 0) {
2184                                 zone->uz_fini(item, zone->uz_size);
2185                                 return (NULL);
2186                         }
2187                         return (item);
2188                 }
2189                 /* This is unfortunate but should not be fatal. */
2190         }
2191 #endif
2192         /*
2193          * If possible, allocate from the per-CPU cache.  There are two
2194          * requirements for safe access to the per-CPU cache: (1) the thread
2195          * accessing the cache must not be preempted or yield during access,
2196          * and (2) the thread must not migrate CPUs without switching which
2197          * cache it accesses.  We rely on a critical section to prevent
2198          * preemption and migration.  We release the critical section in
2199          * order to acquire the zone mutex if we are unable to allocate from
2200          * the current cache; when we re-acquire the critical section, we
2201          * must detect and handle migration if it has occurred.
2202          */
2203         critical_enter();
2204         cpu = curcpu;
2205         cache = &zone->uz_cpu[cpu];
2206
2207 zalloc_start:
2208         bucket = cache->uc_allocbucket;
2209         if (bucket != NULL && bucket->ub_cnt > 0) {
2210                 bucket->ub_cnt--;
2211                 item = bucket->ub_bucket[bucket->ub_cnt];
2212 #ifdef INVARIANTS
2213                 bucket->ub_bucket[bucket->ub_cnt] = NULL;
2214 #endif
2215                 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2216                 cache->uc_allocs++;
2217                 critical_exit();
2218                 if (zone->uz_ctor != NULL &&
2219                     zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2220                         atomic_add_long(&zone->uz_fails, 1);
2221                         zone_free_item(zone, item, udata, SKIP_DTOR);
2222                         return (NULL);
2223                 }
2224 #ifdef INVARIANTS
2225                 uma_dbg_alloc(zone, NULL, item);
2226 #endif
2227                 if (flags & M_ZERO)
2228                         uma_zero_item(item, zone);
2229                 return (item);
2230         }
2231
2232         /*
2233          * We have run out of items in our alloc bucket.
2234          * See if we can switch with our free bucket.
2235          */
2236         bucket = cache->uc_freebucket;
2237         if (bucket != NULL && bucket->ub_cnt > 0) {
2238 #ifdef UMA_DEBUG_ALLOC
2239                 printf("uma_zalloc: Swapping empty with alloc.\n");
2240 #endif
2241                 cache->uc_freebucket = cache->uc_allocbucket;
2242                 cache->uc_allocbucket = bucket;
2243                 goto zalloc_start;
2244         }
2245
2246         /*
2247          * Discard any empty allocation bucket while we hold no locks.
2248          */
2249         bucket = cache->uc_allocbucket;
2250         cache->uc_allocbucket = NULL;
2251         critical_exit();
2252         if (bucket != NULL)
2253                 bucket_free(zone, bucket, udata);
2254
2255         /* Short-circuit for zones without buckets and low memory. */
2256         if (zone->uz_count == 0 || bucketdisable)
2257                 goto zalloc_item;
2258
2259         /*
2260          * Attempt to retrieve the item from the per-CPU cache has failed, so
2261          * we must go back to the zone.  This requires the zone lock, so we
2262          * must drop the critical section, then re-acquire it when we go back
2263          * to the cache.  Since the critical section is released, we may be
2264          * preempted or migrate.  As such, make sure not to maintain any
2265          * thread-local state specific to the cache from prior to releasing
2266          * the critical section.
2267          */
2268         lockfail = 0;
2269         if (ZONE_TRYLOCK(zone) == 0) {
2270                 /* Record contention to size the buckets. */
2271                 ZONE_LOCK(zone);
2272                 lockfail = 1;
2273         }
2274         critical_enter();
2275         cpu = curcpu;
2276         cache = &zone->uz_cpu[cpu];
2277
2278         /*
2279          * Since we have locked the zone we may as well send back our stats.
2280          */
2281         atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2282         atomic_add_long(&zone->uz_frees, cache->uc_frees);
2283         cache->uc_allocs = 0;
2284         cache->uc_frees = 0;
2285
2286         /* See if we lost the race to fill the cache. */
2287         if (cache->uc_allocbucket != NULL) {
2288                 ZONE_UNLOCK(zone);
2289                 goto zalloc_start;
2290         }
2291
2292         /*
2293          * Check the zone's cache of buckets.
2294          */
2295         if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
2296                 KASSERT(bucket->ub_cnt != 0,
2297                     ("uma_zalloc_arg: Returning an empty bucket."));
2298
2299                 LIST_REMOVE(bucket, ub_link);
2300                 cache->uc_allocbucket = bucket;
2301                 ZONE_UNLOCK(zone);
2302                 goto zalloc_start;
2303         }
2304         /* We are no longer associated with this CPU. */
2305         critical_exit();
2306
2307         /*
2308          * We bump the uz count when the cache size is insufficient to
2309          * handle the working set.
2310          */
2311         if (lockfail && zone->uz_count < BUCKET_MAX)
2312                 zone->uz_count++;
2313         ZONE_UNLOCK(zone);
2314
2315         /*
2316          * Now lets just fill a bucket and put it on the free list.  If that
2317          * works we'll restart the allocation from the begining and it
2318          * will use the just filled bucket.
2319          */
2320         bucket = zone_alloc_bucket(zone, udata, flags);
2321         if (bucket != NULL) {
2322                 ZONE_LOCK(zone);
2323                 critical_enter();
2324                 cpu = curcpu;
2325                 cache = &zone->uz_cpu[cpu];
2326                 /*
2327                  * See if we lost the race or were migrated.  Cache the
2328                  * initialized bucket to make this less likely or claim
2329                  * the memory directly.
2330                  */
2331                 if (cache->uc_allocbucket == NULL)
2332                         cache->uc_allocbucket = bucket;
2333                 else
2334                         LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2335                 ZONE_UNLOCK(zone);
2336                 goto zalloc_start;
2337         }
2338
2339         /*
2340          * We may not be able to get a bucket so return an actual item.
2341          */
2342 #ifdef UMA_DEBUG
2343         printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2344 #endif
2345
2346 zalloc_item:
2347         item = zone_alloc_item(zone, udata, flags);
2348
2349         return (item);
2350 }
2351
2352 static uma_slab_t
2353 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2354 {
2355         uma_slab_t slab;
2356         int reserve;
2357
2358         mtx_assert(&keg->uk_lock, MA_OWNED);
2359         slab = NULL;
2360         reserve = 0;
2361         if ((flags & M_USE_RESERVE) == 0)
2362                 reserve = keg->uk_reserve;
2363
2364         for (;;) {
2365                 /*
2366                  * Find a slab with some space.  Prefer slabs that are partially
2367                  * used over those that are totally full.  This helps to reduce
2368                  * fragmentation.
2369                  */
2370                 if (keg->uk_free > reserve) {
2371                         if (!LIST_EMPTY(&keg->uk_part_slab)) {
2372                                 slab = LIST_FIRST(&keg->uk_part_slab);
2373                         } else {
2374                                 slab = LIST_FIRST(&keg->uk_free_slab);
2375                                 LIST_REMOVE(slab, us_link);
2376                                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2377                                     us_link);
2378                         }
2379                         MPASS(slab->us_keg == keg);
2380                         return (slab);
2381                 }
2382
2383                 /*
2384                  * M_NOVM means don't ask at all!
2385                  */
2386                 if (flags & M_NOVM)
2387                         break;
2388
2389                 if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2390                         keg->uk_flags |= UMA_ZFLAG_FULL;
2391                         /*
2392                          * If this is not a multi-zone, set the FULL bit.
2393                          * Otherwise slab_multi() takes care of it.
2394                          */
2395                         if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2396                                 zone->uz_flags |= UMA_ZFLAG_FULL;
2397                                 zone_log_warning(zone);
2398                                 zone_maxaction(zone);
2399                         }
2400                         if (flags & M_NOWAIT)
2401                                 break;
2402                         zone->uz_sleeps++;
2403                         msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2404                         continue;
2405                 }
2406                 slab = keg_alloc_slab(keg, zone, flags);
2407                 /*
2408                  * If we got a slab here it's safe to mark it partially used
2409                  * and return.  We assume that the caller is going to remove
2410                  * at least one item.
2411                  */
2412                 if (slab) {
2413                         MPASS(slab->us_keg == keg);
2414                         LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2415                         return (slab);
2416                 }
2417                 /*
2418                  * We might not have been able to get a slab but another cpu
2419                  * could have while we were unlocked.  Check again before we
2420                  * fail.
2421                  */
2422                 flags |= M_NOVM;
2423         }
2424         return (slab);
2425 }
2426
2427 static uma_slab_t
2428 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2429 {
2430         uma_slab_t slab;
2431
2432         if (keg == NULL) {
2433                 keg = zone_first_keg(zone);
2434                 KEG_LOCK(keg);
2435         }
2436
2437         for (;;) {
2438                 slab = keg_fetch_slab(keg, zone, flags);
2439                 if (slab)
2440                         return (slab);
2441                 if (flags & (M_NOWAIT | M_NOVM))
2442                         break;
2443         }
2444         KEG_UNLOCK(keg);
2445         return (NULL);
2446 }
2447
2448 /*
2449  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2450  * with the keg locked.  On NULL no lock is held.
2451  *
2452  * The last pointer is used to seed the search.  It is not required.
2453  */
2454 static uma_slab_t
2455 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2456 {
2457         uma_klink_t klink;
2458         uma_slab_t slab;
2459         uma_keg_t keg;
2460         int flags;
2461         int empty;
2462         int full;
2463
2464         /*
2465          * Don't wait on the first pass.  This will skip limit tests
2466          * as well.  We don't want to block if we can find a provider
2467          * without blocking.
2468          */
2469         flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2470         /*
2471          * Use the last slab allocated as a hint for where to start
2472          * the search.
2473          */
2474         if (last != NULL) {
2475                 slab = keg_fetch_slab(last, zone, flags);
2476                 if (slab)
2477                         return (slab);
2478                 KEG_UNLOCK(last);
2479         }
2480         /*
2481          * Loop until we have a slab incase of transient failures
2482          * while M_WAITOK is specified.  I'm not sure this is 100%
2483          * required but we've done it for so long now.
2484          */
2485         for (;;) {
2486                 empty = 0;
2487                 full = 0;
2488                 /*
2489                  * Search the available kegs for slabs.  Be careful to hold the
2490                  * correct lock while calling into the keg layer.
2491                  */
2492                 LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2493                         keg = klink->kl_keg;
2494                         KEG_LOCK(keg);
2495                         if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2496                                 slab = keg_fetch_slab(keg, zone, flags);
2497                                 if (slab)
2498                                         return (slab);
2499                         }
2500                         if (keg->uk_flags & UMA_ZFLAG_FULL)
2501                                 full++;
2502                         else
2503                                 empty++;
2504                         KEG_UNLOCK(keg);
2505                 }
2506                 if (rflags & (M_NOWAIT | M_NOVM))
2507                         break;
2508                 flags = rflags;
2509                 /*
2510                  * All kegs are full.  XXX We can't atomically check all kegs
2511                  * and sleep so just sleep for a short period and retry.
2512                  */
2513                 if (full && !empty) {
2514                         ZONE_LOCK(zone);
2515                         zone->uz_flags |= UMA_ZFLAG_FULL;
2516                         zone->uz_sleeps++;
2517                         zone_log_warning(zone);
2518                         zone_maxaction(zone);
2519                         msleep(zone, zone->uz_lockptr, PVM,
2520                             "zonelimit", hz/100);
2521                         zone->uz_flags &= ~UMA_ZFLAG_FULL;
2522                         ZONE_UNLOCK(zone);
2523                         continue;
2524                 }
2525         }
2526         return (NULL);
2527 }
2528
2529 static void *
2530 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2531 {
2532         void *item;
2533         uint8_t freei;
2534
2535         MPASS(keg == slab->us_keg);
2536         mtx_assert(&keg->uk_lock, MA_OWNED);
2537
2538         freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2539         BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2540         item = slab->us_data + (keg->uk_rsize * freei);
2541         slab->us_freecount--;
2542         keg->uk_free--;
2543
2544         /* Move this slab to the full list */
2545         if (slab->us_freecount == 0) {
2546                 LIST_REMOVE(slab, us_link);
2547                 LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2548         }
2549
2550         return (item);
2551 }
2552
2553 static int
2554 zone_import(uma_zone_t zone, void **bucket, int max, int flags)
2555 {
2556         uma_slab_t slab;
2557         uma_keg_t keg;
2558         int i;
2559
2560         slab = NULL;
2561         keg = NULL;
2562         /* Try to keep the buckets totally full */
2563         for (i = 0; i < max; ) {
2564                 if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
2565                         break;
2566                 keg = slab->us_keg;
2567                 while (slab->us_freecount && i < max) {
2568                         bucket[i++] = slab_alloc_item(keg, slab);
2569                         if (keg->uk_free <= keg->uk_reserve)
2570                                 break;
2571                 }
2572                 /* Don't grab more than one slab at a time. */
2573                 flags &= ~M_WAITOK;
2574                 flags |= M_NOWAIT;
2575         }
2576         if (slab != NULL)
2577                 KEG_UNLOCK(keg);
2578
2579         return i;
2580 }
2581
     /*
      * Allocate a bucket and fill it with items obtained through the zone's
      * uz_import method, running the zone's uz_init on each imported item.
      *
      * Arguments:
      *      zone   The zone to fill a bucket for.
      *      udata  Passed through to bucket_alloc() and bucket_free().
      *      flags  Allocation flags handed to uz_import and uz_init; only the
      *             M_NOVM bit is forwarded to the bucket allocation itself.
      *
      * Returns:
      *      A bucket holding at least one item, or NULL if no bucket could be
      *      allocated or no item could be imported and initialized.
      */
2582 static uma_bucket_t
2583 zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
2584 {
2585         uma_bucket_t bucket;
2586         int max;
2587
2588         /* Don't wait for buckets, preserve caller's NOVM setting. */
2589         bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2590         if (bucket == NULL)
2591                 return (NULL);
2592
             /* Import no more than the bucket holds or uz_count allows. */
2593         max = MIN(bucket->ub_entries, zone->uz_count);
2594         bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2595             max, flags);
2596
2597         /*
2598          * Initialize the memory if necessary.
2599          */
2600         if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2601                 int i;
2602
                     /* Stop at the first item whose uz_init fails. */
2603                 for (i = 0; i < bucket->ub_cnt; i++)
2604                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2605                             flags) != 0)
2606                                 break;
2607                 /*
2608                  * If we couldn't initialize the whole bucket, put the
2609                  * rest back onto the freelist.  The released range starts
                      * at index i, i.e. it includes the item whose init failed.
2610                  */
2611                 if (i != bucket->ub_cnt) {
2612                         zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2613                             bucket->ub_cnt - i);
2614 #ifdef INVARIANTS
                             /* Clear the slots that were just released. */
2615                         bzero(&bucket->ub_bucket[i],
2616                             sizeof(void *) * (bucket->ub_cnt - i));
2617 #endif
2618                         bucket->ub_cnt = i;
2619                 }
2620         }
2621
             /* Nothing usable: return the bucket and record the failure. */
2622         if (bucket->ub_cnt == 0) {
2623                 bucket_free(zone, bucket, udata);
2624                 atomic_add_long(&zone->uz_fails, 1);
2625                 return (NULL);
2626         }
2627
2628         return (bucket);
2629 }
2630
2631 /*
2632  * Allocates a single item from a zone.
      *
      * The item is obtained directly through the zone's uz_import method; the
      * per-CPU caches are not consulted here.
2633  *
2634  * Arguments
2635  *      zone   The zone to alloc for.
2636  *      udata  The data to be passed to the constructor.
2637  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
2638  *
2639  * Returns
2640  *      NULL if there is no memory and M_NOWAIT is set
      *      NULL if the zone's init or ctor fails for the item
2641  *      An item if successful
      *
      * Every NULL return also increments the zone's uz_fails counter.
2642  */
2643
2644 static void *
2645 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2646 {
2647         void *item;
2648
2649         item = NULL;
2650
2651 #ifdef UMA_DEBUG_ALLOC
2652         printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2653 #endif
             /* Ask the backend for exactly one item. */
2654         if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
2655                 goto fail;
2656         atomic_add_long(&zone->uz_allocs, 1);
2657
2658         /*
2659          * We have to call both the zone's init (not the keg's init)
2660          * and the zone's ctor.  This is because the item is going from
2661          * a keg slab directly to the user, and the user is expecting it
2662          * to be both zone-init'd as well as zone-ctor'd.
2663          */
2664         if (zone->uz_init != NULL) {
2665                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
                             /* Init failed, so neither dtor nor fini apply. */
2666                         zone_free_item(zone, item, udata, SKIP_FINI);
2667                         goto fail;
2668                 }
2669         }
2670         if (zone->uz_ctor != NULL) {
2671                 if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
                             /* Ctor failed: skip the dtor but still run fini. */
2672                         zone_free_item(zone, item, udata, SKIP_DTOR);
2673                         goto fail;
2674                 }
2675         }
2676 #ifdef INVARIANTS
2677         uma_dbg_alloc(zone, NULL, item);
2678 #endif
2679         if (flags & M_ZERO)
2680                 uma_zero_item(item, zone);
2681
2682         return (item);
2683
2684 fail:
2685         atomic_add_long(&zone->uz_fails, 1);
2686         return (NULL);
2687 }
2688
2689 /*
      * See uma.h
      *
      * Free an item to a zone.  After running the zone's dtor, the item is
      * placed in the current CPU's alloc or free bucket when one has room.
      * Otherwise a full free bucket is moved to the zone's bucket list and a
      * new bucket is allocated; if that is not possible, or the zone is
      * flagged full or has bucket caching disabled, the item is handed to
      * zone_free_item().
      *
      * Arguments:
      *      zone   The zone the item is freed to.
      *      item   The item to free; NULL is accepted and ignored.
      *      udata  Passed to the zone's dtor and to the bucket allocator.
      */
2690 void
2691 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2692 {
2693         uma_cache_t cache;
2694         uma_bucket_t bucket;
2695         int lockfail;
2696         int cpu;
2697
2698         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2699         random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2700
2701 #ifdef UMA_DEBUG_ALLOC_1
2702         printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2703 #endif
2704         CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2705             zone->uz_name);
2706
2707         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2708             ("uma_zfree_arg: called with spinlock or critical section held"));
2709
2710         /* uma_zfree(..., NULL) does nothing, to match free(9). */
2711         if (item == NULL)
2712                 return;
2713 #ifdef DEBUG_MEMGUARD
             /*
              * Memguard-owned items: run dtor and fini here (except the
              * mtrash ones) and return the memory straight to memguard.
              */
2714         if (is_memguard_addr(item)) {
2715                 if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
2716                         zone->uz_dtor(item, zone->uz_size, udata);
2717                 if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
2718                         zone->uz_fini(item, zone->uz_size);
2719                 memguard_free(item);
2720                 return;
2721         }
2722 #endif
2723 #ifdef INVARIANTS
             /* Only UMA_ZONE_MALLOC zones pass udata to the debug hook. */
2724         if (zone->uz_flags & UMA_ZONE_MALLOC)
2725                 uma_dbg_free(zone, udata, item);
2726         else
2727                 uma_dbg_free(zone, NULL, item);
2728 #endif
             /* The dtor runs here, so later paths use SKIP_DTOR. */
2729         if (zone->uz_dtor != NULL)
2730                 zone->uz_dtor(item, zone->uz_size, udata);
2731
2732         /*
2733          * The race here is acceptable.  If we miss it we'll just have to wait
2734          * a little longer for the limits to be reset.
2735          */
2736         if (zone->uz_flags & UMA_ZFLAG_FULL)
2737                 goto zfree_item;
2738
2739         /*
2740          * If possible, free to the per-CPU cache.  There are two
2741          * requirements for safe access to the per-CPU cache: (1) the thread
2742          * accessing the cache must not be preempted or yield during access,
2743          * and (2) the thread must not migrate CPUs without switching which
2744          * cache it accesses.  We rely on a critical section to prevent
2745          * preemption and migration.  We release the critical section in
2746          * order to acquire the zone mutex if we are unable to free to the
2747          * current cache; when we re-acquire the critical section, we must
2748          * detect and handle migration if it has occurred.
2749          */
2750 zfree_restart:
2751         critical_enter();
2752         cpu = curcpu;
2753         cache = &zone->uz_cpu[cpu];
2754
2755 zfree_start:
2756         /*
2757          * Try to free into the allocbucket first to give LIFO ordering
2758          * for cache-hot datastructures.  Spill over into the freebucket
2759          * if necessary.  Alloc will swap them if one runs dry.
2760          */
2761         bucket = cache->uc_allocbucket;
2762         if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
2763                 bucket = cache->uc_freebucket;
2764         if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2765                 KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2766                     ("uma_zfree: Freeing to non free bucket index."));
2767                 bucket->ub_bucket[bucket->ub_cnt] = item;
2768                 bucket->ub_cnt++;
2769                 cache->uc_frees++;
2770                 critical_exit();
2771                 return;
2772         }
2773
2774         /*
2775          * We must go back to the zone, which requires acquiring the zone
2776          * lock, which in turn means we must release and re-acquire the
2777          * critical section.  Since the critical section is released, we may
2778          * be preempted or migrate.  As such, make sure not to maintain any
2779          * thread-local state specific to the cache from prior to releasing
2780          * the critical section.
2781          */
2782         critical_exit();
             /* With bucket caching off there is nothing to refill. */
2783         if (zone->uz_count == 0 || bucketdisable)
2784                 goto zfree_item;
2785
2786         lockfail = 0;
2787         if (ZONE_TRYLOCK(zone) == 0) {
2788                 /* Record contention to size the buckets. */
2789                 ZONE_LOCK(zone);
2790                 lockfail = 1;
2791         }
             /* Re-read the CPU: we may have migrated while unlocked. */
2792         critical_enter();
2793         cpu = curcpu;
2794         cache = &zone->uz_cpu[cpu];
2795
2796         /*
2797          * Since we have locked the zone we may as well send back our stats.
2798          */
2799         atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2800         atomic_add_long(&zone->uz_frees, cache->uc_frees);
2801         cache->uc_allocs = 0;
2802         cache->uc_frees = 0;
2803
             /*
              * This CPU's free bucket may have room (e.g. after a migration);
              * if so, retry the fast path with the critical section held.
              */
2804         bucket = cache->uc_freebucket;
2805         if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2806                 ZONE_UNLOCK(zone);
2807                 goto zfree_start;
2808         }
2809         cache->uc_freebucket = NULL;
2810
2811         /* Can we throw this on the zone full list? */
2812         if (bucket != NULL) {
2813 #ifdef UMA_DEBUG_ALLOC
2814                 printf("uma_zfree: Putting old bucket on the free list.\n");
2815 #endif
2816                 /* ub_cnt is pointing to the last free item */
2817                 KASSERT(bucket->ub_cnt != 0,
2818                     ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2819                 LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2820         }
2821
2822         /* We are no longer associated with this CPU. */
2823         critical_exit();
2824
2825         /*
2826          * We bump the uz count when the cache size is insufficient to
2827          * handle the working set.
2828          */
2829         if (lockfail && zone->uz_count < BUCKET_MAX)
2830                 zone->uz_count++;
2831         ZONE_UNLOCK(zone);
2832
2833 #ifdef UMA_DEBUG_ALLOC
2834         printf("uma_zfree: Allocating new free bucket.\n");
2835 #endif
             /* Allocated with no zone lock and no critical section held. */
2836         bucket = bucket_alloc(zone, udata, M_NOWAIT);
2837         if (bucket) {
2838                 critical_enter();
2839                 cpu = curcpu;
2840                 cache = &zone->uz_cpu[cpu];
                     /* Install the new bucket only if the slot is still empty. */
2841                 if (cache->uc_freebucket == NULL) {
2842                         cache->uc_freebucket = bucket;
2843                         goto zfree_start;
2844                 }
2845                 /*
2846                  * We lost the race, start over.  We have to drop our
2847                  * critical section to free the bucket.
2848                  */
2849                 critical_exit();
2850                 bucket_free(zone, bucket, udata);
2851                 goto zfree_restart;
2852         }
2853
2854         /*
2855          * If nothing else caught this, we'll just do an internal free.
2856          */
2857 zfree_item:
2858         zone_free_item(zone, item, udata, SKIP_DTOR);
2859
2860         return;
2861 }
2862
     /*
      * Return one item to its slab: move the slab between the keg's slab
      * lists if its fill state changes, mark the item's index free in the
      * slab's bitmap and bump the slab and keg free counters.
      *
      * The keg lock must be held and the slab must belong to the keg; both
      * are asserted below.
      */
2863 static void
2864 slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
2865 {
2866         uint8_t freei;
2867
2868         mtx_assert(&keg->uk_lock, MA_OWNED);
2869         MPASS(keg == slab->us_keg);
2870
2871         /* Do we need to remove from any lists? */
2872         if (slab->us_freecount+1 == keg->uk_ipers) {
                     /* This free makes the slab completely free. */
2873                 LIST_REMOVE(slab, us_link);
2874                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2875         } else if (slab->us_freecount == 0) {
                     /* The slab had no free items; it becomes partially used. */
2876                 LIST_REMOVE(slab, us_link);
2877                 LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2878         }
2879
2880         /* Slab management. */
             /* Item index = byte offset into the slab data / uk_rsize. */
2881         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
2882         BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
2883         slab->us_freecount++;
2884
2885         /* Keg statistics. */
2886         keg->uk_free++;
2887 }
2888
     /*
      * Return an array of items to the slabs they came from.
      *
      * Arguments:
      *      zone    The zone the items belong to.
      *      bucket  Array of item pointers to release.
      *      cnt     Number of entries in bucket.
      *
      * For each item the owning slab is located, the item is freed with
      * slab_free_item() under the keg lock, and sleepers on the keg are woken
      * if the keg is flagged full.  If a keg's full flag was cleared, the
      * zone's UMA_ZFLAG_FULL is cleared afterwards under the zone lock.
      */
2889 static void
2890 zone_release(uma_zone_t zone, void **bucket, int cnt)
2891 {
2892         void *item;
2893         uma_slab_t slab;
2894         uma_keg_t keg;
2895         uint8_t *mem;
2896         int clearfull;
2897         int i;
2898
2899         clearfull = 0;
2900         keg = zone_first_keg(zone);
2901         KEG_LOCK(keg);
2902         for (i = 0; i < cnt; i++) {
2903                 item = bucket[i];
2904                 if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
                             /* Round the item address down to its slab base. */
2905                         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
2906                         if (zone->uz_flags & UMA_ZONE_HASH) {
2907                                 slab = hash_sfind(&keg->uk_hash, mem);
2908                         } else {
                                     /* Slab header lives at uk_pgoff in the slab. */
2909                                 mem += keg->uk_pgoff;
2910                                 slab = (uma_slab_t)mem;
2911                         }
2912                 } else {
2913                         slab = vtoslab((vm_offset_t)item);
                             /* Item is from another keg: switch keg locks. */
2914                         if (slab->us_keg != keg) {
2915                                 KEG_UNLOCK(keg);
2916                                 keg = slab->us_keg;
2917                                 KEG_LOCK(keg);
2918                         }
2919                 }
2920                 slab_free_item(keg, slab, item);
2921                 if (keg->uk_flags & UMA_ZFLAG_FULL) {
2922                         if (keg->uk_pages < keg->uk_maxpages) {
2923                                 keg->uk_flags &= ~UMA_ZFLAG_FULL;
2924                                 clearfull = 1;
2925                         }
2926
2927                         /*
2928                          * We can handle one more allocation. Since we're
2929                          * clearing ZFLAG_FULL, wake up all procs blocked
2930                          * on pages. This should be uncommon, so keeping this
2931                          * simple for now (rather than adding count of blocked
2932                          * threads etc).
2933                          */
2934                         wakeup(keg);
2935                 }
2936         }
2937         KEG_UNLOCK(keg);
             /* The zone flag is updated only after the keg lock is dropped. */
2938         if (clearfull) {
2939                 ZONE_LOCK(zone);
2940                 zone->uz_flags &= ~UMA_ZFLAG_FULL;
2941                 wakeup(zone);
2942                 ZONE_UNLOCK(zone);
2943         }
2944
2945 }
2946
2947 /*
2948  * Frees a single item to any zone.
      *
      * Runs the zone's dtor and fini as permitted by skip, counts the free in
      * uz_frees and hands the item to the zone's uz_release method.
2949  *
2950  * Arguments:
2951  *      zone   The zone to free to
2952  *      item   The item we're freeing
2953  *      udata  User supplied data for the dtor
2954  *      skip   Skip dtors and finis
2955  */
2956 static void
2957 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
2958 {
2959
2960 #ifdef INVARIANTS
             /* The debug free hook runs only when nothing is skipped. */
2961         if (skip == SKIP_NONE) {
2962                 if (zone->uz_flags & UMA_ZONE_MALLOC)
2963                         uma_dbg_free(zone, udata, item);
2964                 else
2965                         uma_dbg_free(zone, NULL, item);
2966         }
2967 #endif
             /* skip values are ordered; each check is a threshold. */
2968         if (skip < SKIP_DTOR && zone->uz_dtor)
2969                 zone->uz_dtor(item, zone->uz_size, udata);
2970
2971         if (skip < SKIP_FINI && zone->uz_fini)
2972                 zone->uz_fini(item, zone->uz_size);
2973
2974         atomic_add_long(&zone->uz_frees, 1);
2975         zone->uz_release(zone->uz_arg, &item, 1);
2976 }
2977
2978 /*
      * See uma.h
      *
      * Limit the zone's first keg to the smallest whole number of slabs that
      * can hold nitems items.
      *
      * Arguments:
      *      zone    The zone to limit.
      *      nitems  The requested maximum number of items.
      *
      * Returns:
      *      The effective item limit after rounding up to whole slabs, or 0
      *      if the zone has no keg.
      *
      * uk_maxpages is kept in pages while a slab spans uk_ppera pages, so
      * the item capacity of the limit is (uk_maxpages / uk_ppera) * uk_ipers.
      * This is the same formula uma_print_keg() and the zone stats sysctl use
      * to report the limit.
      */
2979 int
2980 uma_zone_set_max(uma_zone_t zone, int nitems)
2981 {
2982         uma_keg_t keg;
2983
2984         keg = zone_first_keg(zone);
2985         if (keg == NULL)
2986                 return (0);
2987         KEG_LOCK(keg);
             /* Whole slabs that fit in nitems, expressed in pages. */
2988         keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
             /* Add one more slab if those slabs hold fewer than nitems. */
2989         if ((keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers < nitems)
2990                 keg->uk_maxpages += keg->uk_ppera;
2991         nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
2992         KEG_UNLOCK(keg);
2993
2994         return (nitems);
2995 }
2996
2997 /*
      * See uma.h
      *
      * Report the item limit of the zone's first keg.
      *
      * Returns:
      *      (uk_maxpages / uk_ppera) * uk_ipers, i.e. the number of items that
      *      fit in the permitted number of slabs, or 0 if the zone has no keg.
      *      Dividing by uk_ppera converts the page limit into slabs, matching
      *      how uma_print_keg() and the zone stats sysctl report the limit.
      */
2998 int
2999 uma_zone_get_max(uma_zone_t zone)
3000 {
3001         int nitems;
3002         uma_keg_t keg;
3003
3004         keg = zone_first_keg(zone);
3005         if (keg == NULL)
3006                 return (0);
3007         KEG_LOCK(keg);
3008         nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3009         KEG_UNLOCK(keg);
3010
3011         return (nitems);
3012 }
3013
3014 /*
      * See uma.h
      *
      * Store the warning string pointer in the zone's uz_warning field under
      * the zone lock.  Only the pointer is stored; the string is not copied.
      */
3015 void
3016 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3017 {
3018
3019         ZONE_LOCK(zone);
3020         zone->uz_warning = warning;
3021         ZONE_UNLOCK(zone);
3022 }
3023
3024 /*
      * See uma.h
      *
      * Store the callback in the zone's uz_maxaction field under the zone
      * lock.
      */
3025 void
3026 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3027 {
3028
3029         ZONE_LOCK(zone);
3030         zone->uz_maxaction = maxaction;
3031         ZONE_UNLOCK(zone);
3032 }
3033
3034 /*
      * See uma.h
      *
      * Estimate the number of items currently allocated from the zone.
      *
      * Returns:
      *      The zone's allocs minus frees, plus the same difference from every
      *      per-CPU cache, clamped to 0 if the sum is negative.  The 64-bit
      *      sum is converted to int on return.
      */
3035 int
3036 uma_zone_get_cur(uma_zone_t zone)
3037 {
3038         int64_t nitems;
3039         u_int i;
3040
3041         ZONE_LOCK(zone);
3042         nitems = zone->uz_allocs - zone->uz_frees;
3043         CPU_FOREACH(i) {
3044                 /*
3045                  * See the comment in sysctl_vm_zone_stats() regarding the
3046                  * safety of accessing the per-cpu caches. With the zone lock
3047                  * held, it is safe, but can potentially result in stale data.
3048                  */
3049                 nitems += zone->uz_cpu[i].uc_allocs -
3050                     zone->uz_cpu[i].uc_frees;
3051         }
3052         ZONE_UNLOCK(zone);
3053
3054         return (nitems < 0 ? 0 : nitems);
3055 }
3056
3057 /*
      * See uma.h
      *
      * Set the init function (uk_init) of the zone's first keg.  Asserts that
      * the zone has a keg and that the keg has no pages yet.
      */
3058 void
3059 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3060 {
3061         uma_keg_t keg;
3062
3063         keg = zone_first_keg(zone);
3064         KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
3065         KEG_LOCK(keg);
3066         KASSERT(keg->uk_pages == 0,
3067             ("uma_zone_set_init on non-empty keg"));
3068         keg->uk_init = uminit;
3069         KEG_UNLOCK(keg);
3070 }
3071
3072 /*
      * See uma.h
      *
      * Set the fini function (uk_fini) of the zone's first keg.  Asserts that
      * the zone has a keg and that the keg has no pages yet.
      */
3073 void
3074 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3075 {
3076         uma_keg_t keg;
3077
3078         keg = zone_first_keg(zone);
3079         KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3080         KEG_LOCK(keg);
3081         KASSERT(keg->uk_pages == 0,
3082             ("uma_zone_set_fini on non-empty keg"));
3083         keg->uk_fini = fini;
3084         KEG_UNLOCK(keg);
3085 }
3086
3087 /*
      * See uma.h
      *
      * Set the zone-level init function (uz_init) under the zone lock.
      * Asserts that the zone's first keg has no pages yet.
      *
      * NOTE(review): the assertion dereferences zone_first_keg(zone) without
      * a NULL check, unlike uma_zone_set_init(); confirm that callers never
      * pass a zone without a keg.
      */
3088 void
3089 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3090 {
3091
3092         ZONE_LOCK(zone);
3093         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3094             ("uma_zone_set_zinit on non-empty keg"));
3095         zone->uz_init = zinit;
3096         ZONE_UNLOCK(zone);
3097 }
3098
3099 /*
      * See uma.h
      *
      * Set the zone-level fini function (uz_fini) under the zone lock.
      * Asserts that the zone's first keg has no pages yet.
      *
      * NOTE(review): the assertion dereferences zone_first_keg(zone) without
      * a NULL check, unlike uma_zone_set_fini(); confirm that callers never
      * pass a zone without a keg.
      */
3100 void
3101 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3102 {
3103
3104         ZONE_LOCK(zone);
3105         KASSERT(zone_first_keg(zone)->uk_pages == 0,
3106             ("uma_zone_set_zfini on non-empty keg"));
3107         zone->uz_fini = zfini;
3108         ZONE_UNLOCK(zone);
3109 }
3110
3111 /*
      * See uma.h
      *
      * Set the page free function (uk_freef) of the zone's first keg under
      * the keg lock.  Asserts that the zone has a keg.
      */
3112 /* XXX uk_freef is not actually used with the zone locked */
3113 void
3114 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3115 {
3116         uma_keg_t keg;
3117
3118         keg = zone_first_keg(zone);
3119         KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3120         KEG_LOCK(keg);
3121         keg->uk_freef = freef;
3122         KEG_UNLOCK(keg);
3123 }
3124
3125 /*
      * See uma.h
      *
      * Set the page allocation function (uk_allocf) of the zone's first keg
      * under the keg lock.  Asserts that the zone has a keg, as the other
      * keg setters (uma_zone_set_init/fini/freef) do, so that a keg-less
      * zone is reported by the assertion instead of faulting in KEG_LOCK().
      */
3126 /* XXX uk_allocf is not actually used with the zone locked */
3127 void
3128 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3129 {
3130         uma_keg_t keg;
3131
3132         keg = zone_first_keg(zone);
             KASSERT(keg != NULL, ("uma_zone_set_allocf: Invalid zone type"));
3133         KEG_LOCK(keg);
3134         keg->uk_allocf = allocf;
3135         KEG_UNLOCK(keg);
3136 }
3137
3138 /*
      * See uma.h
      *
      * Set the reserve (uk_reserve) of the zone's first keg to items under
      * the keg lock.  Does nothing if the zone has no keg.
      */
3139 void
3140 uma_zone_reserve(uma_zone_t zone, int items)
3141 {
3142         uma_keg_t keg;
3143
3144         keg = zone_first_keg(zone);
3145         if (keg == NULL)
3146                 return;
3147         KEG_LOCK(keg);
3148         keg->uk_reserve = items;
3149         KEG_UNLOCK(keg);
3150
3151         return;
3152 }
3153
3154 /* See uma.h */
3155 int
3156 uma_zone_reserve_kva(uma_zone_t zone, int count)
3157 {
3158         uma_keg_t keg;
3159         vm_offset_t kva;
3160         u_int pages;
3161
3162         keg = zone_first_keg(zone);
3163         if (keg == NULL)
3164                 return (0);
3165         pages = count / keg->uk_ipers;
3166
3167         if (pages * keg->uk_ipers < count)
3168                 pages++;
3169
3170 #ifdef UMA_MD_SMALL_ALLOC
3171         if (keg->uk_ppera > 1) {
3172 #else
3173         if (1) {
3174 #endif
3175                 kva = kva_alloc((vm_size_t)pages * UMA_SLAB_SIZE);
3176                 if (kva == 0)
3177                         return (0);
3178         } else
3179                 kva = 0;
3180         KEG_LOCK(keg);
3181         keg->uk_kva = kva;
3182         keg->uk_offset = 0;
3183         keg->uk_maxpages = pages;
3184 #ifdef UMA_MD_SMALL_ALLOC
3185         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3186 #else
3187         keg->uk_allocf = noobj_alloc;
3188 #endif
3189         keg->uk_flags |= UMA_ZONE_NOFREE;
3190         KEG_UNLOCK(keg);
3191
3192         return (1);
3193 }
3194
3195 /*
      * See uma.h
      *
      * Pre-allocate enough slabs in the zone's first keg to hold items items
      * (rounded up to whole slabs) and put them on the keg's free slab list.
      * Stops early if a slab allocation fails; does nothing if the zone has
      * no keg.
      */
3196 void
3197 uma_prealloc(uma_zone_t zone, int items)
3198 {
3199         int slabs;
3200         uma_slab_t slab;
3201         uma_keg_t keg;
3202
3203         keg = zone_first_keg(zone);
3204         if (keg == NULL)
3205                 return;
3206         KEG_LOCK(keg);
             /* slabs = ceil(items / uk_ipers) */
3207         slabs = items / keg->uk_ipers;
3208         if (slabs * keg->uk_ipers < items)
3209                 slabs++;
3210         while (slabs > 0) {
3211                 slab = keg_alloc_slab(keg, zone, M_WAITOK);
3212                 if (slab == NULL)
3213                         break;
3214                 MPASS(slab->us_keg == keg);
3215                 LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3216                 slabs--;
3217         }
3218         KEG_UNLOCK(keg);
3219 }
3220
3221 /*
      * See uma.h
      *
      * Return a pointer to the reference count slot of an item.
      *
      * The item's slab is looked up with vtoslab() on the slab-aligned
      * address, the slab is viewed as a uma_slabrefcnt_t, and the slot is
      * selected by the item's index within the slab.  Asserts that the slab's
      * keg has UMA_ZONE_REFCNT set.  The zone argument is not used.
      */
3222 uint32_t *
3223 uma_find_refcnt(uma_zone_t zone, void *item)
3224 {
3225         uma_slabrefcnt_t slabref;
3226         uma_slab_t slab;
3227         uma_keg_t keg;
3228         uint32_t *refcnt;
3229         int idx;
3230
3231         slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
3232         slabref = (uma_slabrefcnt_t)slab;
3233         keg = slab->us_keg;
3234         KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
3235             ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
             /* Item index = byte offset into the slab data / uk_rsize. */
3236         idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3237         refcnt = &slabref->us_refcnt[idx];
3238         return refcnt;
3239 }
3240
3241 /*
      * Drain all zones to give memory back; the caller must hold
      * uma_drain_lock exclusively (asserted below).
      *
      * Arguments:
      *      kmem_danger  When true, also drain the per-CPU caches and make a
      *                   second pass over the zones, as is done when
      *                   vm_page_count_min() reports true.
      */
3242 static void
3243 uma_reclaim_locked(bool kmem_danger)
3244 {
3245
3246 #ifdef UMA_DEBUG
3247         printf("UMA: vm asked us to release pages!\n");
3248 #endif
3249         sx_assert(&uma_drain_lock, SA_XLOCKED);
3250         bucket_enable();
3251         zone_foreach(zone_drain);
3252         if (vm_page_count_min() || kmem_danger) {
3253                 cache_drain_safe(NULL);
3254                 zone_foreach(zone_drain);
3255         }
3256         /*
3257          * Some slabs may have been freed, but the slab zones were visited
3258          * early.  Visit them again so that we can free pages that became
3259          * empty once other zones were drained.  We do the same for buckets.
3260          */
3261         zone_drain(slabzone);
3262         zone_drain(slabrefzone);
3263         bucket_zone_drain();
3264 }
3265
     /*
      * Run a reclaim pass (without the kmem_danger escalation) while holding
      * uma_drain_lock exclusively.
      */
3266 void
3267 uma_reclaim(void)
3268 {
3269
3270         sx_xlock(&uma_drain_lock);
3271         uma_reclaim_locked(false);
3272         sx_xunlock(&uma_drain_lock);
3273 }
3274
     /*
      * Set to 1 by uma_reclaim_wakeup() and cleared by uma_reclaim_worker();
      * its address also serves as the sleep/wakeup channel between the two.
      */
3275 static int uma_reclaim_needed;
3276
     /*
      * Request an asynchronous reclaim pass: set the flag and wake any thread
      * sleeping on it.  No lock is taken here.
      */
3277 void
3278 uma_reclaim_wakeup(void)
3279 {
3280
3281         uma_reclaim_needed = 1;
3282         wakeup(&uma_reclaim_needed);
3283 }
3284
     /*
      * Body of the reclaim worker thread; never returns.
      *
      * Holds uma_drain_lock exclusively except while asleep in sx_sleep(),
      * and runs uma_reclaim_locked(true) each time uma_reclaim_needed is
      * found set.
      *
      * The flag is tested before going to sleep, so a request posted by
      * uma_reclaim_wakeup() while a reclaim pass was running is serviced on
      * the next iteration instead of waiting for a later wakeup.
      *
      * NOTE(review): uma_reclaim_wakeup() sets the flag without holding
      * uma_drain_lock, so a request that lands between the test and the sleep
      * can still be delayed until the next wakeup.
      */
3285 void
3286 uma_reclaim_worker(void *arg __unused)
3287 {
3288
3289         sx_xlock(&uma_drain_lock);
3290         for (;;) {
                     /* Sleep only when no request is already pending. */
                     if (uma_reclaim_needed == 0)
3291                         sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM,
3292                             "umarcl", 0);
3293                 if (uma_reclaim_needed) {
3294                         uma_reclaim_needed = 0;
3295                         uma_reclaim_locked(true);
3296                 }
3297         }
3298 }
3299
3300 /*
      * See uma.h
      *
      * Report whether the zone is flagged full.
      *
      * Returns:
      *      The UMA_ZFLAG_FULL bit of uz_flags, read under the zone lock;
      *      nonzero means the flag is set.
      */
3301 int
3302 uma_zone_exhausted(uma_zone_t zone)
3303 {
3304         int full;
3305
3306         ZONE_LOCK(zone);
3307         full = (zone->uz_flags & UMA_ZFLAG_FULL);
3308         ZONE_UNLOCK(zone);
3309         return (full);
3310 }
3311
     /*
      * Same result as uma_zone_exhausted(), but reads uz_flags without
      * taking the zone lock.
      */
3312 int
3313 uma_zone_exhausted_nolock(uma_zone_t zone)
3314 {
3315         return (zone->uz_flags & UMA_ZFLAG_FULL);
3316 }
3317
     /*
      * Allocate size bytes through page_alloc(), tracked by a slab header
      * taken from slabzone.
      *
      * Arguments:
      *      size  Number of bytes to allocate.
      *      wait  Allocation flags passed to both the slab header allocation
      *            and page_alloc().
      *
      * Returns:
      *      The allocated memory, or NULL if either the slab header or the
      *      memory could not be allocated.
      */
3318 void *
3319 uma_large_malloc(vm_size_t size, int wait)
3320 {
3321         void *mem;
3322         uma_slab_t slab;
3323         uint8_t flags;
3324
3325         slab = zone_alloc_item(slabzone, NULL, wait);
3326         if (slab == NULL)
3327                 return (NULL);
3328         mem = page_alloc(NULL, size, &flags, wait);
3329         if (mem) {
                     /* Record the slab for this address and fill in the header. */
3330                 vsetslab((vm_offset_t)mem, slab);
3331                 slab->us_data = mem;
3332                 slab->us_flags = flags | UMA_SLAB_MALLOC;
3333                 slab->us_size = size;
3334         } else {
                     /* No memory: give the unused slab header back. */
3335                 zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3336         }
3337
3338         return (mem);
3339 }
3340
     /*
      * Free an allocation made by uma_large_malloc(): release the memory
      * recorded in the slab header with page_free(), then return the header
      * itself to slabzone.
      */
3341 void
3342 uma_large_free(uma_slab_t slab)
3343 {
3344
3345         page_free(slab->us_data, slab->us_size, slab->us_flags);
3346         zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3347 }
3348
     /*
      * Zero an item of the given zone.  For UMA_ZONE_PCPU zones, uz_size
      * bytes are zeroed at the address zpcpu_get_cpu() yields for each CPU
      * index below mp_ncpus; otherwise uz_size bytes are zeroed at item.
      */
3349 static void
3350 uma_zero_item(void *item, uma_zone_t zone)
3351 {
3352
3353         if (zone->uz_flags & UMA_ZONE_PCPU) {
3354                 for (int i = 0; i < mp_ncpus; i++)
3355                         bzero(zpcpu_get_cpu(item, i), zone->uz_size);
3356         } else
3357                 bzero(item, zone->uz_size);
3358 }
3359
     /* Print every zone by applying uma_print_zone() through zone_foreach(). */
3360 void
3361 uma_print_stats(void)
3362 {
3363         zone_foreach(uma_print_zone);
3364 }
3365
     /* Print one line with a slab's keg, data pointer and free item count. */
3366 static void
3367 slab_print(uma_slab_t slab)
3368 {
3369         printf("slab: keg %p, data %p, freecount %d\n",
3370                 slab->us_keg, slab->us_data, slab->us_freecount);
3371 }
3372
     /*
      * Print a per-CPU cache's alloc and free bucket pointers, each followed
      * by the bucket's item count (0 when the bucket pointer is NULL).
      */
3373 static void
3374 cache_print(uma_cache_t cache)
3375 {
3376         printf("alloc: %p(%d), free: %p(%d)\n",
3377                 cache->uc_allocbucket,
3378                 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3379                 cache->uc_freebucket,
3380                 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3381 }
3382
     /*
      * Print a keg's parameters followed by every slab on its partial, free
      * and full slab lists.
      *
      * "out" is computed as uk_ipers * uk_pages - uk_free, and "limit" as
      * (uk_maxpages / uk_ppera) * uk_ipers.
      */
3383 static void
3384 uma_print_keg(uma_keg_t keg)
3385 {
3386         uma_slab_t slab;
3387
3388         printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3389             "out %d free %d limit %d\n",
3390             keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3391             keg->uk_ipers, keg->uk_ppera,
3392             (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3393             (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3394         printf("Part slabs:\n");
3395         LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3396                 slab_print(slab);
3397         printf("Free slabs:\n");
3398         LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3399                 slab_print(slab);
3400         printf("Full slabs:\n");
3401         LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3402                 slab_print(slab);
3403 }
3404
     /*
      * Print a zone's name, address, size and flags, then each keg linked to
      * the zone, then the per-CPU cache of every CPU visited by CPU_FOREACH.
      * No locks are taken here.
      */
3405 void
3406 uma_print_zone(uma_zone_t zone)
3407 {
3408         uma_cache_t cache;
3409         uma_klink_t kl;
3410         int i;
3411
3412         printf("zone: %s(%p) size %d flags %#x\n",
3413             zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3414         LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3415                 uma_print_keg(kl->kl_keg);
3416         CPU_FOREACH(i) {
3417                 cache = &zone->uz_cpu[i];
3418                 printf("CPU %d Cache:\n", i);
3419                 cache_print(cache);
3420         }
3421 }
3422
3423 #ifdef DDB
3424 /*
3425  * Generate statistics across both the zone and its per-cpu cache's.  Return
3426  * desired statistics if the pointer is non-NULL for that statistic.
3427  *
      * Arguments:
      *      z           The zone to summarize.
      *      cachefreep  Out: items held in the per-CPU alloc and free buckets.
      *      allocsp     Out: per-CPU plus zone allocation count.
      *      freesp      Out: per-CPU plus zone free count.
      *      sleepsp     Out: the zone's sleep count.
      *
3428  * Note: does not update the zone statistics, as it can't safely clear the
3429  * per-CPU cache statistic.
3430  *
3431  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3432  * safe from off-CPU; we should modify the caches to track this information
3433  * directly so that we don't have to.
3434  */
3435 static void
3436 uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
3437     uint64_t *freesp, uint64_t *sleepsp)
3438 {
3439         uma_cache_t cache;
3440         uint64_t allocs, frees, sleeps;
3441         int cachefree, cpu;
3442
3443         allocs = frees = sleeps = 0;
3444         cachefree = 0;
             /* Accumulate the per-CPU cache contents and counters. */
3445         CPU_FOREACH(cpu) {
3446                 cache = &z->uz_cpu[cpu];
3447                 if (cache->uc_allocbucket != NULL)
3448                         cachefree += cache->uc_allocbucket->ub_cnt;
3449                 if (cache->uc_freebucket != NULL)
3450                         cachefree += cache->uc_freebucket->ub_cnt;
3451                 allocs += cache->uc_allocs;
3452                 frees += cache->uc_frees;
3453         }
             /* Add the zone-level counters. */
3454         allocs += z->uz_allocs;
3455         frees += z->uz_frees;
3456         sleeps += z->uz_sleeps;
3457         if (cachefreep != NULL)
3458                 *cachefreep = cachefree;
3459         if (allocsp != NULL)
3460                 *allocsp = allocs;
3461         if (freesp != NULL)
3462                 *freesp = frees;
3463         if (sleepsp != NULL)
3464                 *sleepsp = sleeps;
3465 }
3466 #endif /* DDB */
3467
     /*
      * Sysctl handler: count the zones attached to every keg on uma_kegs
      * while holding uma_rwlock for reading, and report the count through
      * sysctl_handle_int().
      *
      * Returns:
      *      The result of sysctl_handle_int().
      */
3468 static int
3469 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3470 {
3471         uma_keg_t kz;
3472         uma_zone_t z;
3473         int count;
3474
3475         count = 0;
3476         rw_rlock(&uma_rwlock);
3477         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3478                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3479                         count++;
3480         }
3481         rw_runlock(&uma_rwlock);
3482         return (sysctl_handle_int(oidp, &count, 0, req));
3483 }
3484
3485 static int
3486 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3487 {
3488         struct uma_stream_header ush;
3489         struct uma_type_header uth;
3490         struct uma_percpu_stat ups;
3491         uma_bucket_t bucket;
3492         struct sbuf sbuf;
3493         uma_cache_t cache;
3494         uma_klink_t kl;
3495         uma_keg_t kz;
3496         uma_zone_t z;
3497         uma_keg_t k;
3498         int count, error, i;
3499
3500         error = sysctl_wire_old_buffer(req, 0);
3501         if (error != 0)
3502                 return (error);
3503         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3504         sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
3505
3506         count = 0;
3507         rw_rlock(&uma_rwlock);
3508         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3509                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
3510                         count++;
3511         }
3512
3513         /*
3514          * Insert stream header.
3515          */
3516         bzero(&ush, sizeof(ush));
3517         ush.ush_version = UMA_STREAM_VERSION;
3518         ush.ush_maxcpus = (mp_maxid + 1);
3519         ush.ush_count = count;
3520         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3521
3522         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3523                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3524                         bzero(&uth, sizeof(uth));
3525                         ZONE_LOCK(z);
3526                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3527                         uth.uth_align = kz->uk_align;
3528                         uth.uth_size = kz->uk_size;
3529                         uth.uth_rsize = kz->uk_rsize;
3530                         LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3531                                 k = kl->kl_keg;
3532                                 uth.uth_maxpages += k->uk_maxpages;
3533                                 uth.uth_pages += k->uk_pages;
3534                                 uth.uth_keg_free += k->uk_free;
3535                                 uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3536                                     * k->uk_ipers;
3537                         }
3538
3539                         /*
3540                          * A zone is secondary if it is not the first entry
3541                          * on the keg's zone list.
3542                          */
3543                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3544                             (LIST_FIRST(&kz->uk_zones) != z))
3545                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3546
3547                         LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3548                                 uth.uth_zone_free += bucket->ub_cnt;
3549                         uth.uth_allocs = z->uz_allocs;
3550                         uth.uth_frees = z->uz_frees;
3551                         uth.uth_fails = z->uz_fails;
3552                         uth.uth_sleeps = z->uz_sleeps;
3553                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3554                         /*
3555                          * While it is not normally safe to access the cache
3556                          * bucket pointers while not on the CPU that owns the
3557                          * cache, we only allow the pointers to be exchanged
3558                          * without the zone lock held, not invalidated, so
3559                          * accept the possible race associated with bucket
3560                          * exchange during monitoring.
3561                          */
3562                         for (i = 0; i < (mp_maxid + 1); i++) {
3563                                 bzero(&ups, sizeof(ups));
3564                                 if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3565                                         goto skip;
3566                                 if (CPU_ABSENT(i))
3567                                         goto skip;
3568                                 cache = &z->uz_cpu[i];
3569                                 if (cache->uc_allocbucket != NULL)
3570                                         ups.ups_cache_free +=
3571                                             cache->uc_allocbucket->ub_cnt;
3572                                 if (cache->uc_freebucket != NULL)
3573                                         ups.ups_cache_free +=
3574                                             cache->uc_freebucket->ub_cnt;
3575                                 ups.ups_allocs = cache->uc_allocs;
3576                                 ups.ups_frees = cache->uc_frees;
3577 skip:
3578                                 (void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3579                         }
3580                         ZONE_UNLOCK(z);
3581                 }
3582         }
3583         rw_runlock(&uma_rwlock);
3584         error = sbuf_finish(&sbuf);
3585         sbuf_delete(&sbuf);
3586         return (error);
3587 }
3588
3589 int
3590 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
3591 {
3592         uma_zone_t zone = *(uma_zone_t *)arg1;
3593         int error, max;
3594
3595         max = uma_zone_get_max(zone);
3596         error = sysctl_handle_int(oidp, &max, 0, req);
3597         if (error || !req->newptr)
3598                 return (error);
3599
3600         uma_zone_set_max(zone, max);
3601
3602         return (0);
3603 }
3604
3605 int
3606 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
3607 {
3608         uma_zone_t zone = *(uma_zone_t *)arg1;
3609         int cur;
3610
3611         cur = uma_zone_get_cur(zone);
3612         return (sysctl_handle_int(oidp, &cur, 0, req));
3613 }
3614
3615 #ifdef INVARIANTS
3616 static uma_slab_t
3617 uma_dbg_getslab(uma_zone_t zone, void *item)
3618 {
3619         uma_slab_t slab;
3620         uma_keg_t keg;
3621         uint8_t *mem;
3622
3623         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3624         if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
3625                 slab = vtoslab((vm_offset_t)mem);
3626         } else {
3627                 /*
3628                  * It is safe to return the slab here even though the
3629                  * zone is unlocked because the item's allocation state
3630                  * essentially holds a reference.
3631                  */
3632                 ZONE_LOCK(zone);
3633                 keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
3634                 if (keg->uk_flags & UMA_ZONE_HASH)
3635                         slab = hash_sfind(&keg->uk_hash, mem);
3636                 else
3637                         slab = (uma_slab_t)(mem + keg->uk_pgoff);
3638                 ZONE_UNLOCK(zone);
3639         }
3640
3641         return (slab);
3642 }
3643
3644 /*
3645  * Set up the slab's freei data such that uma_dbg_free can function.
3646  *
3647  */
3648 static void
3649 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
3650 {
3651         uma_keg_t keg;
3652         int freei;
3653
3654         if (zone_first_keg(zone) == NULL)
3655                 return;
3656         if (slab == NULL) {
3657                 slab = uma_dbg_getslab(zone, item);
3658                 if (slab == NULL)
3659                         panic("uma: item %p did not belong to zone %s\n",
3660                             item, zone->uz_name);
3661         }
3662         keg = slab->us_keg;
3663         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3664
3665         if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3666                 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
3667                     item, zone, zone->uz_name, slab, freei);
3668         BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3669
3670         return;
3671 }
3672
3673 /*
3674  * Verifies freed addresses.  Checks for alignment, valid slab membership
3675  * and duplicate frees.
3676  *
3677  */
3678 static void
3679 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
3680 {
3681         uma_keg_t keg;
3682         int freei;
3683
3684         if (zone_first_keg(zone) == NULL)
3685                 return;
3686         if (slab == NULL) {
3687                 slab = uma_dbg_getslab(zone, item);
3688                 if (slab == NULL)
3689                         panic("uma: Freed item %p did not belong to zone %s\n",
3690                             item, zone->uz_name);
3691         }
3692         keg = slab->us_keg;
3693         freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3694
3695         if (freei >= keg->uk_ipers)
3696                 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
3697                     item, zone, zone->uz_name, slab, freei);
3698
3699         if (((freei * keg->uk_rsize) + slab->us_data) != item)
3700                 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
3701                     item, zone, zone->uz_name, slab, freei);
3702
3703         if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3704                 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
3705                     item, zone, zone->uz_name, slab, freei);
3706
3707         BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3708 }
3709 #endif /* INVARIANTS */
3710
3711 #ifdef DDB
3712 DB_SHOW_COMMAND(uma, db_show_uma)
3713 {
3714         uint64_t allocs, frees, sleeps;
3715         uma_bucket_t bucket;
3716         uma_keg_t kz;
3717         uma_zone_t z;
3718         int cachefree;
3719
3720         db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
3721             "Free", "Requests", "Sleeps", "Bucket");
3722         LIST_FOREACH(kz, &uma_kegs, uk_link) {
3723                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3724                         if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
3725                                 allocs = z->uz_allocs;
3726                                 frees = z->uz_frees;
3727                                 sleeps = z->uz_sleeps;
3728                                 cachefree = 0;
3729                         } else
3730                                 uma_zone_sumstat(z, &cachefree, &allocs,
3731                                     &frees, &sleeps);
3732                         if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
3733                             (LIST_FIRST(&kz->uk_zones) != z)))
3734                                 cachefree += kz->uk_free;
3735                         LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3736                                 cachefree += bucket->ub_cnt;
3737                         db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
3738                             z->uz_name, (uintmax_t)kz->uk_size,
3739                             (intmax_t)(allocs - frees), cachefree,
3740                             (uintmax_t)allocs, sleeps, z->uz_count);
3741                         if (db_pager_quit)
3742                                 return;
3743                 }
3744         }
3745 }
3746
3747 DB_SHOW_COMMAND(umacache, db_show_umacache)
3748 {
3749         uint64_t allocs, frees;
3750         uma_bucket_t bucket;
3751         uma_zone_t z;
3752         int cachefree;
3753
3754         db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
3755             "Requests", "Bucket");
3756         LIST_FOREACH(z, &uma_cachezones, uz_link) {
3757                 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
3758                 LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3759                         cachefree += bucket->ub_cnt;
3760                 db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
3761                     z->uz_name, (uintmax_t)z->uz_size,
3762                     (intmax_t)(allocs - frees), cachefree,
3763                     (uintmax_t)allocs, z->uz_count);
3764                 if (db_pager_quit)
3765                         return;
3766         }
3767 }
3768 #endif  /* DDB */