sys/vm/uma_core.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
   5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   6  * Copyright (c) 2004-2006 Robert N. M. Watson
   7  * All rights reserved.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice unmodified, this list of conditions, and the following
  14  *    disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
  31 /*
  32  * uma_core.c  Implementation of the Universal Memory allocator
  33  *
  34  * This allocator is intended to replace the multitude of similar object caches
  35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  36  * efficient.  A primary design goal is to return unused memory to the rest of
  37  * the system.  This will make the system as a whole more flexible due to the
  38  * ability to move memory to subsystems which most need it instead of leaving
  39  * pools of reserved memory unused.
  40  *
  41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  42  * are well known.
  43  *
  44  */
  45
  46 /*
  47  * TODO:
  48  *      - Improve memory usage for large allocations
  49  *      - Investigate cache size adjustments
  50  */
  51
  52 #include <sys/cdefs.h>
  53 __FBSDID("$FreeBSD$");
  54
  55 #include "opt_ddb.h"
  56 #include "opt_param.h"
  57 #include "opt_vm.h"
  58
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/bitset.h>
  62 #include <sys/domainset.h>
  63 #include <sys/eventhandler.h>
  64 #include <sys/kernel.h>
  65 #include <sys/types.h>
  66 #include <sys/limits.h>
  67 #include <sys/queue.h>
  68 #include <sys/malloc.h>
  69 #include <sys/ktr.h>
  70 #include <sys/lock.h>
  71 #include <sys/sysctl.h>
  72 #include <sys/mutex.h>
  73 #include <sys/proc.h>
  74 #include <sys/random.h>
  75 #include <sys/rwlock.h>
  76 #include <sys/sbuf.h>
  77 #include <sys/sched.h>
  78 #include <sys/sleepqueue.h>
  79 #include <sys/smp.h>
  80 #include <sys/smr.h>
  81 #include <sys/taskqueue.h>
  82 #include <sys/vmmeter.h>
  83
  84 #include <vm/vm.h>
  85 #include <vm/vm_domainset.h>
  86 #include <vm/vm_object.h>
  87 #include <vm/vm_page.h>
  88 #include <vm/vm_pageout.h>
  89 #include <vm/vm_param.h>
  90 #include <vm/vm_phys.h>
  91 #include <vm/vm_pagequeue.h>
  92 #include <vm/vm_map.h>
  93 #include <vm/vm_kern.h>
  94 #include <vm/vm_extern.h>
  95 #include <vm/uma.h>
  96 #include <vm/uma_int.h>
  97 #include <vm/uma_dbg.h>
  98
  99 #include <ddb/ddb.h>
 100
 101 #ifdef DEBUG_MEMGUARD
 102 #include <vm/memguard.h>
 103 #endif
 104
 105 #include <machine/md_var.h>
 106
/*
 * UMA_ALWAYS_CTORDTOR is 1 on INVARIANTS kernels and 0 otherwise.  In this
 * file it is OR'ed into the decision to call item_dtor() on a recycled SMR
 * bucket in zone_fetch_bucket(), so INVARIANTS kernels take that path even
 * when the zone has no uz_dtor.
 */
#ifdef INVARIANTS
#define UMA_ALWAYS_CTORDTOR     1
#else
#define UMA_ALWAYS_CTORDTOR     0
#endif
 112
/*
 * This is the zone and keg from which all zones are spawned.
 */
static uma_zone_t kegs;
static uma_zone_t zones;

/*
 * These are the two zones from which all offpage uma_slab_ts are allocated.
 *
 * One zone is for slab headers that can represent a larger number of items,
 * making the slabs themselves more efficient, and the other zone is for
 * headers that are smaller and represent fewer items, making the headers more
 * efficient.
 *
 * slabzone() indexes slabzones[] by item count: slabzones[0] is returned for
 * slabs of at most SLABZONE0_SETSIZE items and slabzones[1] for larger ones.
 * SLABZONE_SIZE() gives the header size for a set size: struct uma_hash_slab
 * plus SLAB_BITSETS bitsets sized for "setsize".
 */
#define SLABZONE_SIZE(setsize)                                  \
    (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
#define SLABZONE0_SETSIZE       (PAGE_SIZE / 16)
#define SLABZONE1_SETSIZE       SLAB_MAX_SETSIZE
#define SLABZONE0_SIZE  SLABZONE_SIZE(SLABZONE0_SETSIZE)
#define SLABZONE1_SIZE  SLABZONE_SIZE(SLABZONE1_SETSIZE)
static uma_zone_t slabzones[2];
 134
/*
 * The initial hash tables come out of this zone so they can be allocated
 * prior to malloc coming up.
 */
static uma_zone_t hashzone;

/*
 * The boot-time adjusted value for cache line alignment.  The default of
 * 64 - 1 presumably encodes a mask for 64-byte lines -- confirm against the
 * code that adjusts it.
 */
int uma_align_cache = 64 - 1;

static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");

/*
 * Are we allowed to allocate buckets?  Starts out nonzero (disabled);
 * bucket_enable() refreshes it from vm_page_count_min().
 */
static int bucketdisable = 1;

/* Linked list of all kegs in the system */
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);

/* Linked list of all cache-only zones in the system */
static LIST_HEAD(,uma_zone) uma_cachezones =
    LIST_HEAD_INITIALIZER(uma_cachezones);

/* This RW lock protects the keg list */
static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 161
/*
 * First available virtual address for boot time allocations.
 */
static vm_offset_t bootstart;
static vm_offset_t bootmem;

static struct sx uma_reclaim_lock;

/*
 * kmem soft limit, initialized by uma_set_limit().  Ensure that early
 * allocations don't trigger a wakeup of the reclaim thread.
 *
 * Both values below are exported read-only under the vm sysctl tree.
 */
unsigned long uma_kmem_limit = LONG_MAX;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
    "UMA kernel memory soft limit");
unsigned long uma_kmem_total;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
    "UMA kernel memory usage");
 180
/*
 * Is the VM done starting up?
 *
 * The stages are declared in increasing order so that they can be compared:
 * bucket_alloc() returns NULL while booted < BOOT_KVA and bucket_enable()
 * asserts booted >= BOOT_KVA.
 */
static enum {
        BOOT_COLD,
        BOOT_KVA,
        BOOT_RUNNING,
        BOOT_SHUTDOWN,
} booted = BOOT_COLD;

/*
 * This is the handle used to schedule events that need to happen
 * outside of the allocation fast path.  uma_timeout() re-arms it for
 * UMA_TIMEOUT * hz ticks each time it runs.
 */
static struct callout uma_callout;
#define UMA_TIMEOUT     20              /* Seconds for callout interval. */
/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 *
 * NOTE(review): the consumer of these fields is outside this chunk; the
 * field names suggest they carry the zone creation parameters -- confirm
 * against zone_ctor().
 */
struct uma_zctor_args {
        const char *name;
        size_t size;
        uma_ctor ctor;
        uma_dtor dtor;
        uma_init uminit;
        uma_fini fini;
        uma_import import;
        uma_release release;
        void *arg;
        uma_keg_t keg;
        int align;
        uint32_t flags;
};
 214
/*
 * Argument bundle with one field per parameter of uma_kcreate() (zone, size,
 * uminit, fini, align, flags).
 *
 * NOTE(review): presumably handed to keg_ctor() as its constructor argument,
 * mirroring uma_zctor_args for zones -- confirm against uma_kcreate().
 */
struct uma_kctor_args {
        uma_zone_t zone;
        size_t size;
        uma_init uminit;
        uma_fini fini;
        int align;
        uint32_t flags;
};
 223
/*
 * One row of the bucket_zones[] table: a zone that supplies buckets of a
 * fixed capacity.  ubz_zone is filled in by bucket_init(); ubz_name is the
 * name given to that zone; ubz_maxsize is compared against the item size in
 * bucket_select().
 */
struct uma_bucket_zone {
        uma_zone_t      ubz_zone;
        const char      *ubz_name;
        int             ubz_entries;    /* Number of items it can hold. */
        int             ubz_maxsize;    /* Maximum allocation size per-item. */
};
 230
/*
 * Compute the actual number of bucket entries to pack them in power
 * of two sizes for more efficient space utilization.
 *
 * BUCKET_SIZE(n) is the number of pointer slots that remain once the
 * struct uma_bucket header is subtracted from an allocation of n pointers.
 * BUCKET_MAX is the capacity derived from a 256-pointer allocation.
 */
#define BUCKET_SIZE(n)                                          \
    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))

#define BUCKET_MAX      BUCKET_SIZE(256)
#define BUCKET_MIN      2
 240
/*
 * Table of bucket zones, ordered by increasing capacity and decreasing
 * per-item size limit.  Columns are { zone, name, entries, maxsize }; the
 * zone pointers start out NULL and are assigned by bucket_init().  The table
 * ends with a sentinel row whose ubz_entries is 0, which every walker in this
 * file uses as its stop condition.
 */
struct uma_bucket_zone bucket_zones[] = {
        /* Literal bucket sizes. */
        { NULL, "2 Bucket", 2, 4096 },
        { NULL, "4 Bucket", 4, 3072 },
        { NULL, "8 Bucket", 8, 2048 },
        { NULL, "16 Bucket", 16, 1024 },
        /* Rounded down power of 2 sizes for efficiency. */
        { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
        { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
        { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
        { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
        /* Sentinel: ubz_entries == 0 terminates the table. */
        { NULL, NULL, 0}
};
 254
/*
 * Flags and enumerations to be passed to internal functions.
 *
 * zone_fetch_bucket() passes SKIP_NONE to item_dtor().
 * NOTE(review): presumably each other value suppresses the step it names
 * when an item is released -- confirm against item_dtor() and
 * zone_free_item().
 */
enum zfreeskip {
        SKIP_NONE =     0,
        SKIP_CNT =      0x00000001,
        SKIP_DTOR =     0x00010000,
        SKIP_FINI =     0x00020000,
};
 264
/* Forward declarations. */

/* Startup entry points with external linkage. */
void    uma_startup1(vm_offset_t);
void    uma_startup2(void);

/* File-local helpers. */
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_reclaim(uma_zone_t zone, bool);
static int keg_ctor(void *, int, void *, int);
static void keg_dtor(void *, int, void *);
static int zone_ctor(void *, int, void *, int);
static void zone_dtor(void *, int, void *);
static inline void item_dtor(uma_zone_t zone, void *item, int size,
    void *udata, enum zfreeskip skip);
static int zero_init(void *, int, int);
static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
    int itemdomain, bool ws);
static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
static void zone_timeout(uma_zone_t zone, void *);
static int hash_alloc(struct uma_hash *, u_int);
static int hash_expand(struct uma_hash *, struct uma_hash *);
static void hash_free(struct uma_hash *hash);
static void uma_timeout(void *);
static void uma_startup3(void);
static void uma_shutdown(void);
static void *zone_alloc_item(uma_zone_t, void *, int, int);
static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
static void zone_free_limit(uma_zone_t zone, int count);
static void bucket_enable(void);
static void bucket_init(void);
static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
static void bucket_zone_drain(void);
static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
    uma_fini fini, int align, uint32_t flags);
static int zone_import(void *, void **, int, int, int);
static void zone_release(void *, void **, int);
static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);

/* Sysctl handlers. */
static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);

static uint64_t uma_zone_get_allocs(uma_zone_t zone);
 327
/* Parent node for the vm.debug.* knobs declared below. */
static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
    "Memory allocation debugging");

#ifdef INVARIANTS
/* Debugging helpers and counters that exist only on INVARIANTS kernels. */
static uint64_t uma_keg_get_allocs(uma_keg_t zone);
static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);

static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);

static u_int dbg_divisor = 1;
SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
    "Debug & thrash every this item in memory allocator");

static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
    &uma_dbg_cnt, "memory items debugged");
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
    &uma_skip_cnt, "memory items skipped, not debugged");
#endif

/* Register uma_startup3() to run at SI_SUB_VM_CONF, SI_ORDER_SECOND. */
SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW, 0, "Universal Memory Allocator");

SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");

SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");

/* Consulted by zone_log_warning(); zero suppresses its message entirely. */
static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when UMA zones becomes full");

/* Loader tunable, also exposed read-only as vm.debug.uma_multipage_slabs. */
static int multipage_slabs = 1;
TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs);
SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0,
    "UMA may choose larger slab sizes for better efficiency");
 372
 373 /*
 374  * Select the slab zone for an offpage slab with the given maximum item count.
 375  */
 376 static inline uma_zone_t
 377 slabzone(int ipers)
 378 {
 379
 380         return (slabzones[ipers > SLABZONE0_SETSIZE]);
 381 }
 382
 383 /*
 384  * This routine checks to see whether or not it's safe to enable buckets.
 385  */
 386 static void
 387 bucket_enable(void)
 388 {
 389
 390         KASSERT(booted >= BOOT_KVA, ("Bucket enable before init"));
 391         bucketdisable = vm_page_count_min();
 392 }
 393
 394 /*
 395  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 396  *
 397  * For each zone, calculate the memory required for each bucket, consisting
 398  * of the header and an array of pointers.
 399  */
 400 static void
 401 bucket_init(void)
 402 {
 403         struct uma_bucket_zone *ubz;
 404         int size;
 405
 406         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 407                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 408                 size += sizeof(void *) * ubz->ubz_entries;
 409                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 410                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 411                     UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
 412                     UMA_ZONE_FIRSTTOUCH);
 413         }
 414 }
 415
 416 /*
 417  * Given a desired number of entries for a bucket, return the zone from which
 418  * to allocate the bucket.
 419  */
 420 static struct uma_bucket_zone *
 421 bucket_zone_lookup(int entries)
 422 {
 423         struct uma_bucket_zone *ubz;
 424
 425         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 426                 if (ubz->ubz_entries >= entries)
 427                         return (ubz);
 428         ubz--;
 429         return (ubz);
 430 }
 431
 432 static struct uma_bucket_zone *
 433 bucket_zone_max(uma_zone_t zone, int nitems)
 434 {
 435         struct uma_bucket_zone *ubz;
 436         int bpcpu;
 437
 438         bpcpu = 2;
 439         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 440                 /* Count the cross-domain bucket. */
 441                 bpcpu++;
 442
 443         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 444                 if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
 445                         break;
 446         if (ubz == &bucket_zones[0])
 447                 ubz = NULL;
 448         else
 449                 ubz--;
 450         return (ubz);
 451 }
 452
 453 static int
 454 bucket_select(int size)
 455 {
 456         struct uma_bucket_zone *ubz;
 457
 458         ubz = &bucket_zones[0];
 459         if (size > ubz->ubz_maxsize)
 460                 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
 461
 462         for (; ubz->ubz_entries != 0; ubz++)
 463                 if (ubz->ubz_maxsize < size)
 464                         break;
 465         ubz--;
 466         return (ubz->ubz_entries);
 467 }
 468
 469 static uma_bucket_t
 470 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 471 {
 472         struct uma_bucket_zone *ubz;
 473         uma_bucket_t bucket;
 474
 475         /*
 476          * Don't allocate buckets early in boot.
 477          */
 478         if (__predict_false(booted < BOOT_KVA))
 479                 return (NULL);
 480
 481         /*
 482          * To limit bucket recursion we store the original zone flags
 483          * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 484          * NOVM flag to persist even through deep recursions.  We also
 485          * store ZFLAG_BUCKET once we have recursed attempting to allocate
 486          * a bucket for a bucket zone so we do not allow infinite bucket
 487          * recursion.  This cookie will even persist to frees of unused
 488          * buckets via the allocation path or bucket allocations in the
 489          * free path.
 490          */
 491         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 492                 udata = (void *)(uintptr_t)zone->uz_flags;
 493         else {
 494                 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 495                         return (NULL);
 496                 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 497         }
 498         if (((uintptr_t)udata & UMA_ZONE_VM) != 0)
 499                 flags |= M_NOVM;
 500         ubz = bucket_zone_lookup(zone->uz_bucket_size);
 501         if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 502                 ubz++;
 503         bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 504         if (bucket) {
 505 #ifdef INVARIANTS
 506                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 507 #endif
 508                 bucket->ub_cnt = 0;
 509                 bucket->ub_entries = ubz->ubz_entries;
 510                 bucket->ub_seq = SMR_SEQ_INVALID;
 511                 CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
 512                     zone->uz_name, zone, bucket);
 513         }
 514
 515         return (bucket);
 516 }
 517
 518 static void
 519 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 520 {
 521         struct uma_bucket_zone *ubz;
 522
 523         if (bucket->ub_cnt != 0)
 524                 bucket_drain(zone, bucket);
 525
 526         KASSERT(bucket->ub_cnt == 0,
 527             ("bucket_free: Freeing a non free bucket."));
 528         KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
 529             ("bucket_free: Freeing an SMR bucket."));
 530         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 531                 udata = (void *)(uintptr_t)zone->uz_flags;
 532         ubz = bucket_zone_lookup(bucket->ub_entries);
 533         uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 534 }
 535
 536 static void
 537 bucket_zone_drain(void)
 538 {
 539         struct uma_bucket_zone *ubz;
 540
 541         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 542                 uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
 543 }
 544
 545 /*
 546  * Acquire the domain lock and record contention.
 547  */
 548 static uma_zone_domain_t
 549 zone_domain_lock(uma_zone_t zone, int domain)
 550 {
 551         uma_zone_domain_t zdom;
 552         bool lockfail;
 553
 554         zdom = ZDOM_GET(zone, domain);
 555         lockfail = false;
 556         if (ZDOM_OWNED(zdom))
 557                 lockfail = true;
 558         ZDOM_LOCK(zdom);
 559         /* This is unsynchronized.  The counter does not need to be precise. */
 560         if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
 561                 zone->uz_bucket_size++;
 562         return (zdom);
 563 }
 564
 565 /*
 566  * Search for the domain with the least cached items and return it, breaking
 567  * ties with a preferred domain by returning it.
 568  */
 569 static __noinline int
 570 zone_domain_lowest(uma_zone_t zone, int pref)
 571 {
 572         long least, nitems;
 573         int domain;
 574         int i;
 575
 576         least = LONG_MAX;
 577         domain = 0;
 578         for (i = 0; i < vm_ndomains; i++) {
 579                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
 580                 if (nitems < least) {
 581                         domain = i;
 582                         least = nitems;
 583                 } else if (nitems == least && (i == pref || domain == pref))
 584                         domain = pref;
 585         }
 586
 587         return (domain);
 588 }
 589
 590 /*
 591  * Search for the domain with the most cached items and return it or the
 592  * preferred domain if it has enough to proceed.
 593  */
 594 static __noinline int
 595 zone_domain_highest(uma_zone_t zone, int pref)
 596 {
 597         long most, nitems;
 598         int domain;
 599         int i;
 600
 601         if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX)
 602                 return (pref);
 603
 604         most = 0;
 605         domain = 0;
 606         for (i = 0; i < vm_ndomains; i++) {
 607                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
 608                 if (nitems > most) {
 609                         domain = i;
 610                         most = nitems;
 611                 }
 612         }
 613
 614         return (domain);
 615 }
 616
 617 /*
 618  * Safely subtract cnt from imax.
 619  */
 620 static void
 621 zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt)
 622 {
 623         long new;
 624         long old;
 625
 626         old = zdom->uzd_imax;
 627         do {
 628                 if (old <= cnt)
 629                         new = 0;
 630                 else
 631                         new = old - cnt;
 632         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0);
 633 }
 634
 635 /*
 636  * Set the maximum imax value.
 637  */
 638 static void
 639 zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
 640 {
 641         long old;
 642
 643         old = zdom->uzd_imax;
 644         do {
 645                 if (old >= nitems)
 646                         break;
 647         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
 648 }
 649
 650 /*
 651  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
 652  * zone's caches.  If a bucket is found the zone is not locked on return.
 653  */
 654 static uma_bucket_t
 655 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 656 {
 657         uma_bucket_t bucket;
 658         int i;
 659         bool dtor = false;
 660
 661         ZDOM_LOCK_ASSERT(zdom);
 662
 663         if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
 664                 return (NULL);
 665
 666         /* SMR Buckets can not be re-used until readers expire. */
 667         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
 668             bucket->ub_seq != SMR_SEQ_INVALID) {
 669                 if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
 670                         return (NULL);
 671                 bucket->ub_seq = SMR_SEQ_INVALID;
 672                 dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR;
 673                 if (STAILQ_NEXT(bucket, ub_link) != NULL)
 674                         zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq;
 675         }
 676         MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
 677         STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link);
 678         zdom->uzd_nitems -= bucket->ub_cnt;
 679
 680         /*
 681          * Shift the bounds of the current WSS interval to avoid
 682          * perturbing the estimate.
 683          */
 684         if (reclaim) {
 685                 zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
 686                 zone_domain_imax_sub(zdom, bucket->ub_cnt);
 687         } else if (zdom->uzd_imin > zdom->uzd_nitems)
 688                 zdom->uzd_imin = zdom->uzd_nitems;
 689
 690         ZDOM_UNLOCK(zdom);
 691         if (dtor)
 692                 for (i = 0; i < bucket->ub_cnt; i++)
 693                         item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
 694                             NULL, SKIP_NONE);
 695
 696         return (bucket);
 697 }
 698
 699 /*
 700  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
 701  * whether the bucket's contents should be counted as part of the zone's working
 702  * set.  The bucket may be freed if it exceeds the bucket limit.
 703  */
 704 static void
 705 zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
 706     const bool ws)
 707 {
 708         uma_zone_domain_t zdom;
 709
 710         /* We don't cache empty buckets.  This can happen after a reclaim. */
 711         if (bucket->ub_cnt == 0)
 712                 goto out;
 713         zdom = zone_domain_lock(zone, domain);
 714
 715         KASSERT(!ws || zdom->uzd_nitems < zone->uz_bucket_max,
 716             ("%s: zone %p overflow", __func__, zone));
 717
 718         /*
 719          * Conditionally set the maximum number of items.
 720          */
 721         zdom->uzd_nitems += bucket->ub_cnt;
 722         if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
 723                 if (ws)
 724                         zone_domain_imax_set(zdom, zdom->uzd_nitems);
 725                 if (STAILQ_EMPTY(&zdom->uzd_buckets))
 726                         zdom->uzd_seq = bucket->ub_seq;
 727                 STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
 728                 ZDOM_UNLOCK(zdom);
 729                 return;
 730         }
 731         zdom->uzd_nitems -= bucket->ub_cnt;
 732         ZDOM_UNLOCK(zdom);
 733 out:
 734         bucket_free(zone, bucket, udata);
 735 }
 736
 737 /* Pops an item out of a per-cpu cache bucket. */
 738 static inline void *
 739 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket)
 740 {
 741         void *item;
 742
 743         CRITICAL_ASSERT(curthread);
 744
 745         bucket->ucb_cnt--;
 746         item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt];
 747 #ifdef INVARIANTS
 748         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL;
 749         KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
 750 #endif
 751         cache->uc_allocs++;
 752
 753         return (item);
 754 }
 755
 756 /* Pushes an item into a per-cpu cache bucket. */
 757 static inline void
 758 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item)
 759 {
 760
 761         CRITICAL_ASSERT(curthread);
 762         KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL,
 763             ("uma_zfree: Freeing to non free bucket index."));
 764
 765         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item;
 766         bucket->ucb_cnt++;
 767         cache->uc_frees++;
 768 }
 769
 770 /*
 771  * Unload a UMA bucket from a per-cpu cache.
 772  */
 773 static inline uma_bucket_t
 774 cache_bucket_unload(uma_cache_bucket_t bucket)
 775 {
 776         uma_bucket_t b;
 777
 778         b = bucket->ucb_bucket;
 779         if (b != NULL) {
 780                 MPASS(b->ub_entries == bucket->ucb_entries);
 781                 b->ub_cnt = bucket->ucb_cnt;
 782                 bucket->ucb_bucket = NULL;
 783                 bucket->ucb_entries = bucket->ucb_cnt = 0;
 784         }
 785
 786         return (b);
 787 }
 788
 789 static inline uma_bucket_t
 790 cache_bucket_unload_alloc(uma_cache_t cache)
 791 {
 792
 793         return (cache_bucket_unload(&cache->uc_allocbucket));
 794 }
 795
 796 static inline uma_bucket_t
 797 cache_bucket_unload_free(uma_cache_t cache)
 798 {
 799
 800         return (cache_bucket_unload(&cache->uc_freebucket));
 801 }
 802
 803 static inline uma_bucket_t
 804 cache_bucket_unload_cross(uma_cache_t cache)
 805 {
 806
 807         return (cache_bucket_unload(&cache->uc_crossbucket));
 808 }
 809
 810 /*
 811  * Load a bucket into a per-cpu cache bucket.
 812  */
 813 static inline void
 814 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b)
 815 {
 816
 817         CRITICAL_ASSERT(curthread);
 818         MPASS(bucket->ucb_bucket == NULL);
 819         MPASS(b->ub_seq == SMR_SEQ_INVALID);
 820
 821         bucket->ucb_bucket = b;
 822         bucket->ucb_cnt = b->ub_cnt;
 823         bucket->ucb_entries = b->ub_entries;
 824 }
 825
 826 static inline void
 827 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b)
 828 {
 829
 830         cache_bucket_load(&cache->uc_allocbucket, b);
 831 }
 832
 833 static inline void
 834 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b)
 835 {
 836
 837         cache_bucket_load(&cache->uc_freebucket, b);
 838 }
 839
 840 #ifdef NUMA
 841 static inline void
 842 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b)
 843 {
 844
 845         cache_bucket_load(&cache->uc_crossbucket, b);
 846 }
 847 #endif
 848
 849 /*
 850  * Copy and preserve ucb_spare.
 851  */
 852 static inline void
 853 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 854 {
 855
 856         b1->ucb_bucket = b2->ucb_bucket;
 857         b1->ucb_entries = b2->ucb_entries;
 858         b1->ucb_cnt = b2->ucb_cnt;
 859 }
 860
 861 /*
 862  * Swap two cache buckets.
 863  */
 864 static inline void
 865 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 866 {
 867         struct uma_cache_bucket b3;
 868
 869         CRITICAL_ASSERT(curthread);
 870
 871         cache_bucket_copy(&b3, b1);
 872         cache_bucket_copy(b1, b2);
 873         cache_bucket_copy(b2, &b3);
 874 }
 875
 876 /*
 877  * Attempt to fetch a bucket from a zone on behalf of the current cpu cache.
 878  */
 879 static uma_bucket_t
 880 cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain)
 881 {
 882         uma_zone_domain_t zdom;
 883         uma_bucket_t bucket;
 884
 885         /*
 886          * Avoid the lock if possible.
 887          */
 888         zdom = ZDOM_GET(zone, domain);
 889         if (zdom->uzd_nitems == 0)
 890                 return (NULL);
 891
 892         if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 &&
 893             !smr_poll(zone->uz_smr, zdom->uzd_seq, false))
 894                 return (NULL);
 895
 896         /*
 897          * Check the zone's cache of buckets.
 898          */
 899         zdom = zone_domain_lock(zone, domain);
 900         if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) {
 901                 KASSERT(bucket->ub_cnt != 0,
 902                     ("cache_fetch_bucket: Returning an empty bucket."));
 903                 return (bucket);
 904         }
 905         ZDOM_UNLOCK(zdom);
 906
 907         return (NULL);
 908 }
 909
 910 static void
 911 zone_log_warning(uma_zone_t zone)
 912 {
 913         static const struct timeval warninterval = { 300, 0 };
 914
 915         if (!zone_warnings || zone->uz_warning == NULL)
 916                 return;
 917
 918         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 919                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 920 }
 921
 922 static inline void
 923 zone_maxaction(uma_zone_t zone)
 924 {
 925
 926         if (zone->uz_maxaction.ta_func != NULL)
 927                 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 928 }
 929
 930 /*
 931  * Routine called by timeout which is used to fire off some time interval
 932  * based calculations.  (stats, hash size, etc.)
 933  *
 934  * Arguments:
 935  *      arg   Unused
 936  *
 937  * Returns:
 938  *      Nothing
 939  */
 940 static void
 941 uma_timeout(void *unused)
 942 {
 943         bucket_enable();
 944         zone_foreach(zone_timeout, NULL);
 945
 946         /* Reschedule this event */
 947         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 948 }
 949
 950 /*
 951  * Update the working set size estimate for the zone's bucket cache.
 952  * The constants chosen here are somewhat arbitrary.  With an update period of
 953  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
 954  * last 100s.
 955  */
 956 static void
 957 zone_domain_update_wss(uma_zone_domain_t zdom)
 958 {
 959         long wss;
 960
 961         ZDOM_LOCK(zdom);
 962         MPASS(zdom->uzd_imax >= zdom->uzd_imin);
 963         wss = zdom->uzd_imax - zdom->uzd_imin;
 964         zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
 965         zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
 966         ZDOM_UNLOCK(zdom);
 967 }
 968
 969 /*
 970  * Routine to perform timeout driven calculations.  This expands the
 971  * hashes and does per cpu statistics aggregation.
 972  *
 973  *  Returns nothing.
 974  */
 975 static void
 976 zone_timeout(uma_zone_t zone, void *unused)
 977 {
 978         uma_keg_t keg;
 979         u_int slabs, pages;
 980
 981         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
 982                 goto update_wss;
 983
 984         keg = zone->uz_keg;
 985
 986         /*
 987          * Hash zones are non-numa by definition so the first domain
 988          * is the only one present.
 989          */
 990         KEG_LOCK(keg, 0);
 991         pages = keg->uk_domain[0].ud_pages;
 992
 993         /*
 994          * Expand the keg hash table.
 995          *
 996          * This is done if the number of slabs is larger than the hash size.
 997          * What I'm trying to do here is completely reduce collisions.  This
 998          * may be a little aggressive.  Should I allow for two collisions max?
 999          */
1000         if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) {
1001                 struct uma_hash newhash;
1002                 struct uma_hash oldhash;
1003                 int ret;
1004
1005                 /*
1006                  * This is so involved because allocating and freeing
1007                  * while the keg lock is held will lead to deadlock.
1008                  * I have to do everything in stages and check for
1009                  * races.
1010                  */
1011                 KEG_UNLOCK(keg, 0);
1012                 ret = hash_alloc(&newhash, 1 << fls(slabs));
1013                 KEG_LOCK(keg, 0);
1014                 if (ret) {
1015                         if (hash_expand(&keg->uk_hash, &newhash)) {
1016                                 oldhash = keg->uk_hash;
1017                                 keg->uk_hash = newhash;
1018                         } else
1019                                 oldhash = newhash;
1020
1021                         KEG_UNLOCK(keg, 0);
1022                         hash_free(&oldhash);
1023                         goto update_wss;
1024                 }
1025         }
1026         KEG_UNLOCK(keg, 0);
1027
1028 update_wss:
1029         for (int i = 0; i < vm_ndomains; i++)
1030                 zone_domain_update_wss(ZDOM_GET(zone, i));
1031 }
1032
1033 /*
1034  * Allocate and zero fill the next sized hash table from the appropriate
1035  * backing store.
1036  *
1037  * Arguments:
1038  *      hash  A new hash structure with the old hash size in uh_hashsize
1039  *
1040  * Returns:
1041  *      1 on success and 0 on failure.
1042  */
1043 static int
1044 hash_alloc(struct uma_hash *hash, u_int size)
1045 {
1046         size_t alloc;
1047
1048         KASSERT(powerof2(size), ("hash size must be power of 2"));
1049         if (size > UMA_HASH_SIZE_INIT)  {
1050                 hash->uh_hashsize = size;
1051                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
1052                 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
1053         } else {
1054                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
1055                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
1056                     UMA_ANYDOMAIN, M_WAITOK);
1057                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
1058         }
1059         if (hash->uh_slab_hash) {
1060                 bzero(hash->uh_slab_hash, alloc);
1061                 hash->uh_hashmask = hash->uh_hashsize - 1;
1062                 return (1);
1063         }
1064
1065         return (0);
1066 }
1067
1068 /*
1069  * Expands the hash table for HASH zones.  This is done from zone_timeout
1070  * to reduce collisions.  This must not be done in the regular allocation
1071  * path, otherwise, we can recurse on the vm while allocating pages.
1072  *
1073  * Arguments:
1074  *      oldhash  The hash you want to expand
1075  *      newhash  The hash structure for the new table
1076  *
1077  * Returns:
1078  *      Nothing
1079  *
1080  * Discussion:
1081  */
1082 static int
1083 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
1084 {
1085         uma_hash_slab_t slab;
1086         u_int hval;
1087         u_int idx;
1088
1089         if (!newhash->uh_slab_hash)
1090                 return (0);
1091
1092         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
1093                 return (0);
1094
1095         /*
1096          * I need to investigate hash algorithms for resizing without a
1097          * full rehash.
1098          */
1099
1100         for (idx = 0; idx < oldhash->uh_hashsize; idx++)
1101                 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
1102                         slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
1103                         LIST_REMOVE(slab, uhs_hlink);
1104                         hval = UMA_HASH(newhash, slab->uhs_data);
1105                         LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
1106                             slab, uhs_hlink);
1107                 }
1108
1109         return (1);
1110 }
1111
1112 /*
1113  * Free the hash bucket to the appropriate backing store.
1114  *
1115  * Arguments:
1116  *      slab_hash  The hash bucket we're freeing
1117  *      hashsize   The number of entries in that hash bucket
1118  *
1119  * Returns:
1120  *      Nothing
1121  */
1122 static void
1123 hash_free(struct uma_hash *hash)
1124 {
1125         if (hash->uh_slab_hash == NULL)
1126                 return;
1127         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
1128                 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
1129         else
1130                 free(hash->uh_slab_hash, M_UMAHASH);
1131 }
1132
1133 /*
1134  * Frees all outstanding items in a bucket
1135  *
1136  * Arguments:
1137  *      zone   The zone to free to, must be unlocked.
1138  *      bucket The free/alloc bucket with items.
1139  *
1140  * Returns:
1141  *      Nothing
1142  */
1143 static void
1144 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
1145 {
1146         int i;
1147
1148         if (bucket->ub_cnt == 0)
1149                 return;
1150
1151         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
1152             bucket->ub_seq != SMR_SEQ_INVALID) {
1153                 smr_wait(zone->uz_smr, bucket->ub_seq);
1154                 bucket->ub_seq = SMR_SEQ_INVALID;
1155                 for (i = 0; i < bucket->ub_cnt; i++)
1156                         item_dtor(zone, bucket->ub_bucket[i],
1157                             zone->uz_size, NULL, SKIP_NONE);
1158         }
1159         if (zone->uz_fini)
1160                 for (i = 0; i < bucket->ub_cnt; i++)
1161                         zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
1162         zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
1163         if (zone->uz_max_items > 0)
1164                 zone_free_limit(zone, bucket->ub_cnt);
1165 #ifdef INVARIANTS
1166         bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
1167 #endif
1168         bucket->ub_cnt = 0;
1169 }
1170
1171 /*
1172  * Drains the per cpu caches for a zone.
1173  *
1174  * NOTE: This may only be called while the zone is being torn down, and not
1175  * during normal operation.  This is necessary in order that we do not have
1176  * to migrate CPUs to drain the per-CPU caches.
1177  *
1178  * Arguments:
1179  *      zone     The zone to drain, must be unlocked.
1180  *
1181  * Returns:
1182  *      Nothing
1183  */
1184 static void
1185 cache_drain(uma_zone_t zone)
1186 {
1187         uma_cache_t cache;
1188         uma_bucket_t bucket;
1189         smr_seq_t seq;
1190         int cpu;
1191
1192         /*
1193          * XXX: It is safe to not lock the per-CPU caches, because we're
1194          * tearing down the zone anyway.  I.e., there will be no further use
1195          * of the caches at this point.
1196          *
1197          * XXX: It would good to be able to assert that the zone is being
1198          * torn down to prevent improper use of cache_drain().
1199          */
1200         seq = SMR_SEQ_INVALID;
1201         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
1202                 seq = smr_advance(zone->uz_smr);
1203         CPU_FOREACH(cpu) {
1204                 cache = &zone->uz_cpu[cpu];
1205                 bucket = cache_bucket_unload_alloc(cache);
1206                 if (bucket != NULL)
1207                         bucket_free(zone, bucket, NULL);
1208                 bucket = cache_bucket_unload_free(cache);
1209                 if (bucket != NULL) {
1210                         bucket->ub_seq = seq;
1211                         bucket_free(zone, bucket, NULL);
1212                 }
1213                 bucket = cache_bucket_unload_cross(cache);
1214                 if (bucket != NULL) {
1215                         bucket->ub_seq = seq;
1216                         bucket_free(zone, bucket, NULL);
1217                 }
1218         }
1219         bucket_cache_reclaim(zone, true);
1220 }
1221
1222 static void
1223 cache_shrink(uma_zone_t zone, void *unused)
1224 {
1225
1226         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
1227                 return;
1228
1229         zone->uz_bucket_size =
1230             (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
1231 }
1232
1233 static void
1234 cache_drain_safe_cpu(uma_zone_t zone, void *unused)
1235 {
1236         uma_cache_t cache;
1237         uma_bucket_t b1, b2, b3;
1238         int domain;
1239
1240         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
1241                 return;
1242
1243         b1 = b2 = b3 = NULL;
1244         critical_enter();
1245         cache = &zone->uz_cpu[curcpu];
1246         domain = PCPU_GET(domain);
1247         b1 = cache_bucket_unload_alloc(cache);
1248
1249         /*
1250          * Don't flush SMR zone buckets.  This leaves the zone without a
1251          * bucket and forces every free to synchronize().
1252          */
1253         if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
1254                 b2 = cache_bucket_unload_free(cache);
1255                 b3 = cache_bucket_unload_cross(cache);
1256         }
1257         critical_exit();
1258
1259         if (b1 != NULL)
1260                 zone_free_bucket(zone, b1, NULL, domain, false);
1261         if (b2 != NULL)
1262                 zone_free_bucket(zone, b2, NULL, domain, false);
1263         if (b3 != NULL) {
1264                 /* Adjust the domain so it goes to zone_free_cross. */
1265                 domain = (domain + 1) % vm_ndomains;
1266                 zone_free_bucket(zone, b3, NULL, domain, false);
1267         }
1268 }
1269
1270 /*
1271  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
1272  * This is an expensive call because it needs to bind to all CPUs
1273  * one by one and enter a critical section on each of them in order
1274  * to safely access their cache buckets.
1275  * Zone lock must not be held on call this function.
1276  */
1277 static void
1278 pcpu_cache_drain_safe(uma_zone_t zone)
1279 {
1280         int cpu;
1281
1282         /*
1283          * Polite bucket sizes shrinking was not enough, shrink aggressively.
1284          */
1285         if (zone)
1286                 cache_shrink(zone, NULL);
1287         else
1288                 zone_foreach(cache_shrink, NULL);
1289
1290         CPU_FOREACH(cpu) {
1291                 thread_lock(curthread);
1292                 sched_bind(curthread, cpu);
1293                 thread_unlock(curthread);
1294
1295                 if (zone)
1296                         cache_drain_safe_cpu(zone, NULL);
1297                 else
1298                         zone_foreach(cache_drain_safe_cpu, NULL);
1299         }
1300         thread_lock(curthread);
1301         sched_unbind(curthread);
1302         thread_unlock(curthread);
1303 }
1304
1305 /*
1306  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
1307  * requested a drain, otherwise the per-domain caches are trimmed to either
1308  * estimated working set size.
1309  */
1310 static void
1311 bucket_cache_reclaim(uma_zone_t zone, bool drain)
1312 {
1313         uma_zone_domain_t zdom;
1314         uma_bucket_t bucket;
1315         long target;
1316         int i;
1317
1318         /*
1319          * Shrink the zone bucket size to ensure that the per-CPU caches
1320          * don't grow too large.
1321          */
1322         if (zone->uz_bucket_size > zone->uz_bucket_size_min)
1323                 zone->uz_bucket_size--;
1324
1325         for (i = 0; i < vm_ndomains; i++) {
1326                 /*
1327                  * The cross bucket is partially filled and not part of
1328                  * the item count.  Reclaim it individually here.
1329                  */
1330                 zdom = ZDOM_GET(zone, i);
1331                 if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
1332                         ZONE_CROSS_LOCK(zone);
1333                         bucket = zdom->uzd_cross;
1334                         zdom->uzd_cross = NULL;
1335                         ZONE_CROSS_UNLOCK(zone);
1336                         if (bucket != NULL)
1337                                 bucket_free(zone, bucket, NULL);
1338                 }
1339
1340                 /*
1341                  * If we were asked to drain the zone, we are done only once
1342                  * this bucket cache is empty.  Otherwise, we reclaim items in
1343                  * excess of the zone's estimated working set size.  If the
1344                  * difference nitems - imin is larger than the WSS estimate,
1345                  * then the estimate will grow at the end of this interval and
1346                  * we ignore the historical average.
1347                  */
1348                 ZDOM_LOCK(zdom);
1349                 target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
1350                     zdom->uzd_imin);
1351                 while (zdom->uzd_nitems > target) {
1352                         bucket = zone_fetch_bucket(zone, zdom, true);
1353                         if (bucket == NULL)
1354                                 break;
1355                         bucket_free(zone, bucket, NULL);
1356                         ZDOM_LOCK(zdom);
1357                 }
1358                 ZDOM_UNLOCK(zdom);
1359         }
1360 }
1361
1362 static void
1363 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
1364 {
1365         uint8_t *mem;
1366         int i;
1367         uint8_t flags;
1368
1369         CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
1370             keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
1371
1372         mem = slab_data(slab, keg);
1373         flags = slab->us_flags;
1374         i = start;
1375         if (keg->uk_fini != NULL) {
1376                 for (i--; i > -1; i--)
1377 #ifdef INVARIANTS
1378                 /*
1379                  * trash_fini implies that dtor was trash_dtor. trash_fini
1380                  * would check that memory hasn't been modified since free,
1381                  * which executed trash_dtor.
1382                  * That's why we need to run uma_dbg_kskip() check here,
1383                  * albeit we don't make skip check for other init/fini
1384                  * invocations.
1385                  */
1386                 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
1387                     keg->uk_fini != trash_fini)
1388 #endif
1389                         keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
1390         }
1391         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
1392                 zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab),
1393                     NULL, SKIP_NONE);
1394         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1395         uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1396 }
1397
1398 /*
1399  * Frees pages from a keg back to the system.  This is done on demand from
1400  * the pageout daemon.
1401  *
1402  * Returns nothing.
1403  */
1404 static void
1405 keg_drain(uma_keg_t keg)
1406 {
1407         struct slabhead freeslabs;
1408         uma_domain_t dom;
1409         uma_slab_t slab, tmp;
1410         int i, n;
1411
1412         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1413                 return;
1414
1415         for (i = 0; i < vm_ndomains; i++) {
1416                 CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u",
1417                     keg->uk_name, keg, i, dom->ud_free_items);
1418                 dom = &keg->uk_domain[i];
1419                 LIST_INIT(&freeslabs);
1420
1421                 KEG_LOCK(keg, i);
1422                 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) {
1423                         LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
1424                                 UMA_HASH_REMOVE(&keg->uk_hash, slab);
1425                 }
1426                 n = dom->ud_free_slabs;
1427                 LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link);
1428                 dom->ud_free_slabs = 0;
1429                 dom->ud_free_items -= n * keg->uk_ipers;
1430                 dom->ud_pages -= n * keg->uk_ppera;
1431                 KEG_UNLOCK(keg, i);
1432
1433                 LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp)
1434                         keg_free_slab(keg, slab, keg->uk_ipers);
1435         }
1436 }
1437
1438 static void
1439 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1440 {
1441
1442         /*
1443          * Set draining to interlock with zone_dtor() so we can release our
1444          * locks as we go.  Only dtor() should do a WAITOK call since it
1445          * is the only call that knows the structure will still be available
1446          * when it wakes up.
1447          */
1448         ZONE_LOCK(zone);
1449         while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1450                 if (waitok == M_NOWAIT)
1451                         goto out;
1452                 msleep(zone, &ZDOM_GET(zone, 0)->uzd_lock, PVM, "zonedrain",
1453                     1);
1454         }
1455         zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1456         ZONE_UNLOCK(zone);
1457         bucket_cache_reclaim(zone, drain);
1458
1459         /*
1460          * The DRAINING flag protects us from being freed while
1461          * we're running.  Normally the uma_rwlock would protect us but we
1462          * must be able to release and acquire the right lock for each keg.
1463          */
1464         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
1465                 keg_drain(zone->uz_keg);
1466         ZONE_LOCK(zone);
1467         zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1468         wakeup(zone);
1469 out:
1470         ZONE_UNLOCK(zone);
1471 }
1472
1473 static void
1474 zone_drain(uma_zone_t zone, void *unused)
1475 {
1476
1477         zone_reclaim(zone, M_NOWAIT, true);
1478 }
1479
1480 static void
1481 zone_trim(uma_zone_t zone, void *unused)
1482 {
1483
1484         zone_reclaim(zone, M_NOWAIT, false);
1485 }
1486
1487 /*
1488  * Allocate a new slab for a keg and inserts it into the partial slab list.
1489  * The keg should be unlocked on entry.  If the allocation succeeds it will
1490  * be locked on return.
1491  *
1492  * Arguments:
1493  *      flags   Wait flags for the item initialization routine
1494  *      aflags  Wait flags for the slab allocation
1495  *
1496  * Returns:
1497  *      The slab that was allocated or NULL if there is no memory and the
1498  *      caller specified M_NOWAIT.
1499  */
1500 static uma_slab_t
1501 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1502     int aflags)
1503 {
1504         uma_domain_t dom;
1505         uma_alloc allocf;
1506         uma_slab_t slab;
1507         unsigned long size;
1508         uint8_t *mem;
1509         uint8_t sflags;
1510         int i;
1511
1512         KASSERT(domain >= 0 && domain < vm_ndomains,
1513             ("keg_alloc_slab: domain %d out of range", domain));
1514
1515         allocf = keg->uk_allocf;
1516         slab = NULL;
1517         mem = NULL;
1518         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
1519                 uma_hash_slab_t hslab;
1520                 hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL,
1521                     domain, aflags);
1522                 if (hslab == NULL)
1523                         goto fail;
1524                 slab = &hslab->uhs_slab;
1525         }
1526
1527         /*
1528          * This reproduces the old vm_zone behavior of zero filling pages the
1529          * first time they are added to a zone.
1530          *
1531          * Malloced items are zeroed in uma_zalloc.
1532          */
1533
1534         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1535                 aflags |= M_ZERO;
1536         else
1537                 aflags &= ~M_ZERO;
1538
1539         if (keg->uk_flags & UMA_ZONE_NODUMP)
1540                 aflags |= M_NODUMP;
1541
1542         /* zone is passed for legacy reasons. */
1543         size = keg->uk_ppera * PAGE_SIZE;
1544         mem = allocf(zone, size, domain, &sflags, aflags);
1545         if (mem == NULL) {
1546                 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
1547                         zone_free_item(slabzone(keg->uk_ipers),
1548                             slab_tohashslab(slab), NULL, SKIP_NONE);
1549                 goto fail;
1550         }
1551         uma_total_inc(size);
1552
1553         /* For HASH zones all pages go to the same uma_domain. */
1554         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
1555                 domain = 0;
1556
1557         /* Point the slab into the allocated memory */
1558         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE))
1559                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
1560         else
1561                 slab_tohashslab(slab)->uhs_data = mem;
1562
1563         if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
1564                 for (i = 0; i < keg->uk_ppera; i++)
1565                         vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
1566                             zone, slab);
1567
1568         slab->us_freecount = keg->uk_ipers;
1569         slab->us_flags = sflags;
1570         slab->us_domain = domain;
1571
1572         BIT_FILL(keg->uk_ipers, &slab->us_free);
1573 #ifdef INVARIANTS
1574         BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
1575 #endif
1576
1577         if (keg->uk_init != NULL) {
1578                 for (i = 0; i < keg->uk_ipers; i++)
1579                         if (keg->uk_init(slab_item(slab, keg, i),
1580                             keg->uk_size, flags) != 0)
1581                                 break;
1582                 if (i != keg->uk_ipers) {
1583                         keg_free_slab(keg, slab, i);
1584                         goto fail;
1585                 }
1586         }
1587         KEG_LOCK(keg, domain);
1588
1589         CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1590             slab, keg->uk_name, keg);
1591
1592         if (keg->uk_flags & UMA_ZFLAG_HASH)
1593                 UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1594
1595         /*
1596          * If we got a slab here it's safe to mark it partially used
1597          * and return.  We assume that the caller is going to remove
1598          * at least one item.
1599          */
1600         dom = &keg->uk_domain[domain];
1601         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
1602         dom->ud_pages += keg->uk_ppera;
1603         dom->ud_free_items += keg->uk_ipers;
1604
1605         return (slab);
1606
1607 fail:
1608         return (NULL);
1609 }
1610
1611 /*
1612  * This function is intended to be used early on in place of page_alloc() so
1613  * that we may use the boot time page cache to satisfy allocations before
1614  * the VM is ready.
1615  */
1616 static void *
1617 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1618     int wait)
1619 {
1620         vm_paddr_t pa;
1621         vm_page_t m;
1622         void *mem;
1623         int pages;
1624         int i;
1625
1626         pages = howmany(bytes, PAGE_SIZE);
1627         KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1628
1629         *pflag = UMA_SLAB_BOOT;
1630         m = vm_page_alloc_contig_domain(NULL, 0, domain,
1631             malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages,
1632             (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
1633         if (m == NULL)
1634                 return (NULL);
1635
1636         pa = VM_PAGE_TO_PHYS(m);
1637         for (i = 0; i < pages; i++, pa += PAGE_SIZE) {
1638 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
1639     defined(__riscv) || defined(__powerpc64__)
1640                 if ((wait & M_NODUMP) == 0)
1641                         dump_add_page(pa);
1642 #endif
1643         }
1644         /* Allocate KVA and indirectly advance bootmem. */
1645         mem = (void *)pmap_map(&bootmem, m->phys_addr,
1646             m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE);
1647         if ((wait & M_ZERO) != 0)
1648                 bzero(mem, pages * PAGE_SIZE);
1649
1650         return (mem);
1651 }
1652
1653 static void
1654 startup_free(void *mem, vm_size_t bytes)
1655 {
1656         vm_offset_t va;
1657         vm_page_t m;
1658
1659         va = (vm_offset_t)mem;
1660         m = PHYS_TO_VM_PAGE(pmap_kextract(va));
1661         pmap_remove(kernel_pmap, va, va + bytes);
1662         for (; bytes != 0; bytes -= PAGE_SIZE, m++) {
1663 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
1664     defined(__riscv) || defined(__powerpc64__)
1665                 dump_drop_page(VM_PAGE_TO_PHYS(m));
1666 #endif
1667                 vm_page_unwire_noq(m);
1668                 vm_page_free(m);
1669         }
1670 }
1671
1672 /*
1673  * Allocates a number of pages from the system
1674  *
1675  * Arguments:
1676  *      bytes  The number of bytes requested
1677  *      wait  Shall we wait?
1678  *
1679  * Returns:
1680  *      A pointer to the alloced memory or possibly
1681  *      NULL if M_NOWAIT is set.
1682  */
1683 static void *
1684 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1685     int wait)
1686 {
1687         void *p;        /* Returned page */
1688
1689         *pflag = UMA_SLAB_KERNEL;
1690         p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1691
1692         return (p);
1693 }
1694
1695 static void *
1696 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1697     int wait)
1698 {
1699         struct pglist alloctail;
1700         vm_offset_t addr, zkva;
1701         int cpu, flags;
1702         vm_page_t p, p_next;
1703 #ifdef NUMA
1704         struct pcpu *pc;
1705 #endif
1706
1707         MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1708
1709         TAILQ_INIT(&alloctail);
1710         flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1711             malloc2vm_flags(wait);
1712         *pflag = UMA_SLAB_KERNEL;
1713         for (cpu = 0; cpu <= mp_maxid; cpu++) {
1714                 if (CPU_ABSENT(cpu)) {
1715                         p = vm_page_alloc(NULL, 0, flags);
1716                 } else {
1717 #ifndef NUMA
1718                         p = vm_page_alloc(NULL, 0, flags);
1719 #else
1720                         pc = pcpu_find(cpu);
1721                         if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
1722                                 p = NULL;
1723                         else
1724                                 p = vm_page_alloc_domain(NULL, 0,
1725                                     pc->pc_domain, flags);
1726                         if (__predict_false(p == NULL))
1727                                 p = vm_page_alloc(NULL, 0, flags);
1728 #endif
1729                 }
1730                 if (__predict_false(p == NULL))
1731                         goto fail;
1732                 TAILQ_INSERT_TAIL(&alloctail, p, listq);
1733         }
1734         if ((addr = kva_alloc(bytes)) == 0)
1735                 goto fail;
1736         zkva = addr;
1737         TAILQ_FOREACH(p, &alloctail, listq) {
1738                 pmap_qenter(zkva, &p, 1);
1739                 zkva += PAGE_SIZE;
1740         }
1741         return ((void*)addr);
1742 fail:
1743         TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1744                 vm_page_unwire_noq(p);
1745                 vm_page_free(p);
1746         }
1747         return (NULL);
1748 }
1749
1750 /*
1751  * Allocates a number of pages from within an object
1752  *
1753  * Arguments:
1754  *      bytes  The number of bytes requested
1755  *      wait   Shall we wait?
1756  *
1757  * Returns:
1758  *      A pointer to the alloced memory or possibly
1759  *      NULL if M_NOWAIT is set.
1760  */
1761 static void *
1762 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1763     int wait)
1764 {
1765         TAILQ_HEAD(, vm_page) alloctail;
1766         u_long npages;
1767         vm_offset_t retkva, zkva;
1768         vm_page_t p, p_next;
1769         uma_keg_t keg;
1770
1771         TAILQ_INIT(&alloctail);
1772         keg = zone->uz_keg;
1773
1774         npages = howmany(bytes, PAGE_SIZE);
1775         while (npages > 0) {
1776                 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1777                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1778                     ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1779                     VM_ALLOC_NOWAIT));
1780                 if (p != NULL) {
1781                         /*
1782                          * Since the page does not belong to an object, its
1783                          * listq is unused.
1784                          */
1785                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1786                         npages--;
1787                         continue;
1788                 }
1789                 /*
1790                  * Page allocation failed, free intermediate pages and
1791                  * exit.
1792                  */
1793                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1794                         vm_page_unwire_noq(p);
1795                         vm_page_free(p);
1796                 }
1797                 return (NULL);
1798         }
1799         *flags = UMA_SLAB_PRIV;
1800         zkva = keg->uk_kva +
1801             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1802         retkva = zkva;
1803         TAILQ_FOREACH(p, &alloctail, listq) {
1804                 pmap_qenter(zkva, &p, 1);
1805                 zkva += PAGE_SIZE;
1806         }
1807
1808         return ((void *)retkva);
1809 }
1810
1811 /*
1812  * Allocate physically contiguous pages.
1813  */
1814 static void *
1815 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1816     int wait)
1817 {
1818
1819         *pflag = UMA_SLAB_KERNEL;
1820         return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
1821             bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
1822 }
1823
1824 /*
1825  * Frees a number of pages to the system
1826  *
1827  * Arguments:
1828  *      mem   A pointer to the memory to be freed
1829  *      size  The size of the memory being freed
1830  *      flags The original p->us_flags field
1831  *
1832  * Returns:
1833  *      Nothing
1834  */
1835 static void
1836 page_free(void *mem, vm_size_t size, uint8_t flags)
1837 {
1838
1839         if ((flags & UMA_SLAB_BOOT) != 0) {
1840                 startup_free(mem, size);
1841                 return;
1842         }
1843
1844         KASSERT((flags & UMA_SLAB_KERNEL) != 0,
1845             ("UMA: page_free used with invalid flags %x", flags));
1846
1847         kmem_free((vm_offset_t)mem, size);
1848 }
1849
1850 /*
1851  * Frees pcpu zone allocations
1852  *
1853  * Arguments:
1854  *      mem   A pointer to the memory to be freed
1855  *      size  The size of the memory being freed
1856  *      flags The original p->us_flags field
1857  *
1858  * Returns:
1859  *      Nothing
1860  */
1861 static void
1862 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1863 {
1864         vm_offset_t sva, curva;
1865         vm_paddr_t paddr;
1866         vm_page_t m;
1867
1868         MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1869
1870         if ((flags & UMA_SLAB_BOOT) != 0) {
1871                 startup_free(mem, size);
1872                 return;
1873         }
1874
1875         sva = (vm_offset_t)mem;
1876         for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1877                 paddr = pmap_kextract(curva);
1878                 m = PHYS_TO_VM_PAGE(paddr);
1879                 vm_page_unwire_noq(m);
1880                 vm_page_free(m);
1881         }
1882         pmap_qremove(sva, size >> PAGE_SHIFT);
1883         kva_free(sva, size);
1884 }
1885
1886
/*
 * Item initializer that clears the first 'size' bytes of the item.
 *
 * Arguments/Returns follow uma_init specifications; this initializer
 * always returns 0 and does not look at 'flags'.
 */
static int
zero_init(void *mem, int size, int flags)
{

        memset(mem, 0, size);
        return (0);
}
1898
#ifdef INVARIANTS
/*
 * Return the address located BITSET_SIZE(uk_ipers) bytes past the start
 * of the slab's us_free bitset, typed as the debug bitset.
 */
struct noslabbits *
slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
{
        char *freebits;

        freebits = (char *)&slab->us_free;
        return ((void *)(freebits + BITSET_SIZE(keg->uk_ipers)));
}
#endif
1907
1908 /*
1909  * Actual size of embedded struct slab (!OFFPAGE).
1910  */
1911 size_t
1912 slab_sizeof(int nitems)
1913 {
1914         size_t s;
1915
1916         s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS;
1917         return (roundup(s, UMA_ALIGN_PTR + 1));
1918 }
1919
1920 /*
1921  * Size of memory for embedded slabs (!OFFPAGE).
1922  */
1923 size_t
1924 slab_space(int nitems)
1925 {
1926         return (UMA_SLAB_SIZE - slab_sizeof(nitems));
1927 }
1928
/*
 * Fixed-point helpers: a fraction is represented as an integer scaled by
 * 2^UMA_FIXPT_SHIFT.
 *
 * UMA_FRAC_FIXPT(n, d)  n / d as a fixed-point value; the division is done
 *                       in 64 bits and the result truncated to uint32_t
 * UMA_FIXPT_PCT(f)      fixed-point value f as a whole percentage
 * UMA_PCT_FIXPT(pct)    percentage pct as a fixed-point value
 * UMA_MIN_EFF           fixed-point value of (100 - UMA_MAX_WASTE) percent
 */
#define UMA_FIXPT_SHIFT 31
#define UMA_FRAC_FIXPT(n, d)                                            \
        ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d)))
#define UMA_FIXPT_PCT(f)                                                \
        ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT))
#define UMA_PCT_FIXPT(pct)      UMA_FRAC_FIXPT((pct), 100)
#define UMA_MIN_EFF     UMA_PCT_FIXPT(100 - UMA_MAX_WASTE)
1936
1937 /*
1938  * Compute the number of items that will fit in a slab.  If hdr is true, the
1939  * item count may be limited to provide space in the slab for an inline slab
1940  * header.  Otherwise, all slab space will be provided for item storage.
1941  */
1942 static u_int
1943 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr)
1944 {
1945         u_int ipers;
1946         u_int padpi;
1947
1948         /* The padding between items is not needed after the last item. */
1949         padpi = rsize - size;
1950
1951         if (hdr) {
1952                 /*
1953                  * Start with the maximum item count and remove items until
1954                  * the slab header first alongside the allocatable memory.
1955                  */
1956                 for (ipers = MIN(SLAB_MAX_SETSIZE,
1957                     (slabsize + padpi - slab_sizeof(1)) / rsize);
1958                     ipers > 0 &&
1959                     ipers * rsize - padpi + slab_sizeof(ipers) > slabsize;
1960                     ipers--)
1961                         continue;
1962         } else {
1963                 ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE);
1964         }
1965
1966         return (ipers);
1967 }
1968
1969 /*
1970  * Compute the number of items that will fit in a slab for a startup zone.
1971  */
1972 int
1973 slab_ipers(size_t size, int align)
1974 {
1975         int rsize;
1976
1977         rsize = roundup(size, align + 1); /* Assume no CACHESPREAD */
1978         return (slab_ipers_hdr(size, rsize, UMA_SLAB_SIZE, true));
1979 }
1980
/* Result of evaluating one candidate slab format and size. */
struct keg_layout_result {
        u_int format;           /* format flags, later ORed into uk_flags */
        u_int slabsize;         /* slab size in bytes */
        u_int ipers;            /* items per slab */
        u_int eff;              /* efficiency as a UMA_FRAC_FIXPT() value */
};
1987
1988 static void
1989 keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt,
1990     struct keg_layout_result *kl)
1991 {
1992         u_int total;
1993
1994         kl->format = fmt;
1995         kl->slabsize = slabsize;
1996
1997         /* Handle INTERNAL as inline with an extra page. */
1998         if ((fmt & UMA_ZFLAG_INTERNAL) != 0) {
1999                 kl->format &= ~UMA_ZFLAG_INTERNAL;
2000                 kl->slabsize += PAGE_SIZE;
2001         }
2002
2003         kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize,
2004             (fmt & UMA_ZFLAG_OFFPAGE) == 0);
2005
2006         /* Account for memory used by an offpage slab header. */
2007         total = kl->slabsize;
2008         if ((fmt & UMA_ZFLAG_OFFPAGE) != 0)
2009                 total += slabzone(kl->ipers)->uz_keg->uk_rsize;
2010
2011         kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total);
2012 }
2013
2014 /*
2015  * Determine the format of a uma keg.  This determines where the slab header
2016  * will be placed (inline or offpage) and calculates ipers, rsize, and ppera.
2017  *
2018  * Arguments
2019  *      keg  The zone we should initialize
2020  *
2021  * Returns
2022  *      Nothing
2023  */
2024 static void
2025 keg_layout(uma_keg_t keg)
2026 {
2027         struct keg_layout_result kl = {}, kl_tmp;
2028         u_int fmts[2];
2029         u_int alignsize;
2030         u_int nfmt;
2031         u_int pages;
2032         u_int rsize;
2033         u_int slabsize;
2034         u_int i, j;
2035
2036         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
2037             (keg->uk_size <= UMA_PCPU_ALLOC_SIZE &&
2038              (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0),
2039             ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b",
2040              __func__, keg->uk_name, keg->uk_size, keg->uk_flags,
2041              PRINT_UMA_ZFLAGS));
2042         KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 ||
2043             (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0,
2044             ("%s: incompatible flags 0x%b", __func__, keg->uk_flags,
2045              PRINT_UMA_ZFLAGS));
2046
2047         alignsize = keg->uk_align + 1;
2048
2049         /*
2050          * Calculate the size of each allocation (rsize) according to
2051          * alignment.  If the requested size is smaller than we have
2052          * allocation bits for we round it up.
2053          */
2054         rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT);
2055         rsize = roundup2(rsize, alignsize);
2056
2057         if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) {
2058                 /*
2059                  * We want one item to start on every align boundary in a page.
2060                  * To do this we will span pages.  We will also extend the item
2061                  * by the size of align if it is an even multiple of align.
2062                  * Otherwise, it would fall on the same boundary every time.
2063                  */
2064                 if ((rsize & alignsize) == 0)
2065                         rsize += alignsize;
2066                 slabsize = rsize * (PAGE_SIZE / alignsize);
2067                 slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE);
2068                 slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE);
2069                 slabsize = round_page(slabsize);
2070         } else {
2071                 /*
2072                  * Start with a slab size of as many pages as it takes to
2073                  * represent a single item.  We will try to fit as many
2074                  * additional items into the slab as possible.
2075                  */
2076                 slabsize = round_page(keg->uk_size);
2077         }
2078
2079         /* Build a list of all of the available formats for this keg. */
2080         nfmt = 0;
2081
2082         /* Evaluate an inline slab layout. */
2083         if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0)
2084                 fmts[nfmt++] = 0;
2085
2086         /* TODO: vm_page-embedded slab. */
2087
2088         /*
2089          * We can't do OFFPAGE if we're internal or if we've been
2090          * asked to not go to the VM for buckets.  If we do this we
2091          * may end up going to the VM for slabs which we do not want
2092          * to do if we're UMA_ZONE_VM, which clearly forbids it.
2093          * In those cases, evaluate a pseudo-format called INTERNAL
2094          * which has an inline slab header and one extra page to
2095          * guarantee that it fits.
2096          *
2097          * Otherwise, see if using an OFFPAGE slab will improve our
2098          * efficiency.
2099          */
2100         if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0)
2101                 fmts[nfmt++] = UMA_ZFLAG_INTERNAL;
2102         else
2103                 fmts[nfmt++] = UMA_ZFLAG_OFFPAGE;
2104
2105         /*
2106          * Choose a slab size and format which satisfy the minimum efficiency.
2107          * Prefer the smallest slab size that meets the constraints.
2108          *
2109          * Start with a minimum slab size, to accommodate CACHESPREAD.  Then,
2110          * for small items (up to PAGE_SIZE), the iteration increment is one
2111          * page; and for large items, the increment is one item.
2112          */
2113         i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize);
2114         KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u",
2115             keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize,
2116             rsize, i));
2117         for ( ; ; i++) {
2118                 slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) :
2119                     round_page(rsize * (i - 1) + keg->uk_size);
2120
2121                 for (j = 0; j < nfmt; j++) {
2122                         /* Only if we have no viable format yet. */
2123                         if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 &&
2124                             kl.ipers > 0)
2125                                 continue;
2126
2127                         keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp);
2128                         if (kl_tmp.eff <= kl.eff)
2129                                 continue;
2130
2131                         kl = kl_tmp;
2132
2133                         CTR6(KTR_UMA, "keg %s layout: format %#x "
2134                             "(ipers %u * rsize %u) / slabsize %#x = %u%% eff",
2135                             keg->uk_name, kl.format, kl.ipers, rsize,
2136                             kl.slabsize, UMA_FIXPT_PCT(kl.eff));
2137
2138                         /* Stop when we reach the minimum efficiency. */
2139                         if (kl.eff >= UMA_MIN_EFF)
2140                                 break;
2141                 }
2142
2143                 if (kl.eff >= UMA_MIN_EFF || !multipage_slabs ||
2144                     slabsize >= SLAB_MAX_SETSIZE * rsize ||
2145                     (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0)
2146                         break;
2147         }
2148
2149         pages = atop(kl.slabsize);
2150         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
2151                 pages *= mp_maxid + 1;
2152
2153         keg->uk_rsize = rsize;
2154         keg->uk_ipers = kl.ipers;
2155         keg->uk_ppera = pages;
2156         keg->uk_flags |= kl.format;
2157
2158         /*
2159          * How do we find the slab header if it is offpage or if not all item
2160          * start addresses are in the same page?  We could solve the latter
2161          * case with vaddr alignment, but we don't.
2162          */
2163         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 ||
2164             (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) {
2165                 if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0)
2166                         keg->uk_flags |= UMA_ZFLAG_HASH;
2167                 else
2168                         keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
2169         }
2170
2171         CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u",
2172             __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers,
2173             pages);
2174         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
2175             ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__,
2176              keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize,
2177              keg->uk_ipers, pages));
2178 }
2179
2180 /*
2181  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
2182  * the keg onto the global keg list.
2183  *
2184  * Arguments/Returns follow uma_ctor specifications
2185  *      udata  Actually uma_kctor_args
2186  */
2187 static int
2188 keg_ctor(void *mem, int size, void *udata, int flags)
2189 {
2190         struct uma_kctor_args *arg = udata;
2191         uma_keg_t keg = mem;
2192         uma_zone_t zone;
2193         int i;
2194
2195         bzero(keg, size);
2196         keg->uk_size = arg->size;
2197         keg->uk_init = arg->uminit;
2198         keg->uk_fini = arg->fini;
2199         keg->uk_align = arg->align;
2200         keg->uk_reserve = 0;
2201         keg->uk_flags = arg->flags;
2202
2203         /*
2204          * We use a global round-robin policy by default.  Zones with
2205          * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which
2206          * case the iterator is never run.
2207          */
2208         keg->uk_dr.dr_policy = DOMAINSET_RR();
2209         keg->uk_dr.dr_iter = 0;
2210
2211         /*
2212          * The master zone is passed to us at keg-creation time.
2213          */
2214         zone = arg->zone;
2215         keg->uk_name = zone->uz_name;
2216
2217         if (arg->flags & UMA_ZONE_ZINIT)
2218                 keg->uk_init = zero_init;
2219
2220         if (arg->flags & UMA_ZONE_MALLOC)
2221                 keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
2222
2223 #ifndef SMP
2224         keg->uk_flags &= ~UMA_ZONE_PCPU;
2225 #endif
2226
2227         keg_layout(keg);
2228
2229         /*
2230          * Use a first-touch NUMA policy for kegs that pmap_extract() will
2231          * work on.  Use round-robin for everything else.
2232          *
2233          * Zones may override the default by specifying either.
2234          */
2235 #ifdef NUMA
2236         if ((keg->uk_flags &
2237             (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0)
2238                 keg->uk_flags |= UMA_ZONE_FIRSTTOUCH;
2239         else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0)
2240                 keg->uk_flags |= UMA_ZONE_ROUNDROBIN;
2241 #endif
2242
2243         /*
2244          * If we haven't booted yet we need allocations to go through the
2245          * startup cache until the vm is ready.
2246          */
2247 #ifdef UMA_MD_SMALL_ALLOC
2248         if (keg->uk_ppera == 1)
2249                 keg->uk_allocf = uma_small_alloc;
2250         else
2251 #endif
2252         if (booted < BOOT_KVA)
2253                 keg->uk_allocf = startup_alloc;
2254         else if (keg->uk_flags & UMA_ZONE_PCPU)
2255                 keg->uk_allocf = pcpu_page_alloc;
2256         else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
2257                 keg->uk_allocf = contig_alloc;
2258         else
2259                 keg->uk_allocf = page_alloc;
2260 #ifdef UMA_MD_SMALL_ALLOC
2261         if (keg->uk_ppera == 1)
2262                 keg->uk_freef = uma_small_free;
2263         else
2264 #endif
2265         if (keg->uk_flags & UMA_ZONE_PCPU)
2266                 keg->uk_freef = pcpu_page_free;
2267         else
2268                 keg->uk_freef = page_free;
2269
2270         /*
2271          * Initialize keg's locks.
2272          */
2273         for (i = 0; i < vm_ndomains; i++)
2274                 KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS));
2275
2276         /*
2277          * If we're putting the slab header in the actual page we need to
2278          * figure out where in each page it goes.  See slab_sizeof
2279          * definition.
2280          */
2281         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) {
2282                 size_t shsize;
2283
2284                 shsize = slab_sizeof(keg->uk_ipers);
2285                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
2286                 /*
2287                  * The only way the following is possible is if with our
2288                  * UMA_ALIGN_PTR adjustments we are now bigger than
2289                  * UMA_SLAB_SIZE.  I haven't checked whether this is
2290                  * mathematically possible for all cases, so we make
2291                  * sure here anyway.
2292                  */
2293                 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
2294                     ("zone %s ipers %d rsize %d size %d slab won't fit",
2295                     zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
2296         }
2297
2298         if (keg->uk_flags & UMA_ZFLAG_HASH)
2299                 hash_alloc(&keg->uk_hash, 0);
2300
2301         CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone);
2302
2303         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
2304
2305         rw_wlock(&uma_rwlock);
2306         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
2307         rw_wunlock(&uma_rwlock);
2308         return (0);
2309 }
2310
2311 static void
2312 zone_kva_available(uma_zone_t zone, void *unused)
2313 {
2314         uma_keg_t keg;
2315
2316         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
2317                 return;
2318         KEG_GET(zone, keg);
2319
2320         if (keg->uk_allocf == startup_alloc) {
2321                 /* Switch to the real allocator. */
2322                 if (keg->uk_flags & UMA_ZONE_PCPU)
2323                         keg->uk_allocf = pcpu_page_alloc;
2324                 else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 &&
2325                     keg->uk_ppera > 1)
2326                         keg->uk_allocf = contig_alloc;
2327                 else
2328                         keg->uk_allocf = page_alloc;
2329         }
2330 }
2331
/*
 * Allocate the zone's allocs, frees, fails and xdomain counters, using
 * M_WAITOK for each allocation.  The second argument is unused.
 */
static void
zone_alloc_counters(uma_zone_t zone, void *unused)
{

        zone->uz_allocs = counter_u64_alloc(M_WAITOK);
        zone->uz_frees = counter_u64_alloc(M_WAITOK);
        zone->uz_fails = counter_u64_alloc(M_WAITOK);
        zone->uz_xdomain = counter_u64_alloc(M_WAITOK);
}
2341
2342 static void
2343 zone_alloc_sysctl(uma_zone_t zone, void *unused)
2344 {
2345         uma_zone_domain_t zdom;
2346         uma_domain_t dom;
2347         uma_keg_t keg;
2348         struct sysctl_oid *oid, *domainoid;
2349         int domains, i, cnt;
2350         static const char *nokeg = "cache zone";
2351         char *c;
2352
2353         /*
2354          * Make a sysctl safe copy of the zone name by removing
2355          * any special characters and handling dups by appending
2356          * an index.
2357          */
2358         if (zone->uz_namecnt != 0) {
2359                 /* Count the number of decimal digits and '_' separator. */
2360                 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
2361                         cnt /= 10;
2362                 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
2363                     M_UMA, M_WAITOK);
2364                 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
2365                     zone->uz_namecnt);
2366         } else
2367                 zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
2368         for (c = zone->uz_ctlname; *c != '\0'; c++)
2369                 if (strchr("./\\ -", *c) != NULL)
2370                         *c = '_';
2371
2372         /*
2373          * Basic parameters at the root.
2374          */
2375         zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
2376             OID_AUTO, zone->uz_ctlname, CTLFLAG_RD, NULL, "");
2377         oid = zone->uz_oid;
2378         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2379             "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
2380         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2381             "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
2382             zone, 0, sysctl_handle_uma_zone_flags, "A",
2383             "Allocator configuration flags");
2384         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2385             "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
2386             "Desired per-cpu cache size");
2387         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2388             "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
2389             "Maximum allowed per-cpu cache size");
2390
2391         /*
2392          * keg if present.
2393          */
2394         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
2395                 domains = vm_ndomains;
2396         else
2397                 domains = 1;
2398         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2399             "keg", CTLFLAG_RD, NULL, "");
2400         keg = zone->uz_keg;
2401         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
2402                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2403                     "name", CTLFLAG_RD, keg->uk_name, "Keg name");
2404                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2405                     "rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
2406                     "Real object size with alignment");
2407                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2408                     "ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
2409                     "pages per-slab allocation");
2410                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2411                     "ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
2412                     "items available per-slab");
2413                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2414                     "align", CTLFLAG_RD, &keg->uk_align, 0,
2415                     "item alignment mask");
2416                 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2417                     "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
2418                     keg, 0, sysctl_handle_uma_slab_efficiency, "I",
2419                     "Slab utilization (100 - internal fragmentation %)");
2420                 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid),
2421                     OID_AUTO, "domain", CTLFLAG_RD, NULL, "");
2422                 for (i = 0; i < domains; i++) {
2423                         dom = &keg->uk_domain[i];
2424                         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
2425                             OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD,
2426                             NULL, "");
2427                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2428                             "pages", CTLFLAG_RD, &dom->ud_pages, 0,
2429                             "Total pages currently allocated from VM");
2430                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2431                             "free_items", CTLFLAG_RD, &dom->ud_free_items, 0,
2432                             "items free in the slab layer");
2433                 }
2434         } else
2435                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2436                     "name", CTLFLAG_RD, nokeg, "Keg name");
2437
2438         /*
2439          * Information about zone limits.
2440          */
2441         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2442             "limit", CTLFLAG_RD, NULL, "");
2443         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2444             "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2445             zone, 0, sysctl_handle_uma_zone_items, "QU",
2446             "current number of allocated items if limit is set");
2447         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2448             "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
2449             "Maximum number of cached items");
2450         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2451             "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
2452             "Number of threads sleeping at limit");
2453         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2454             "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
2455             "Total zone limit sleeps");
2456         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2457             "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0,
2458             "Maximum number of items in each domain's bucket cache");
2459
2460         /*
2461          * Per-domain zone information.
2462          */
2463         domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
2464             OID_AUTO, "domain", CTLFLAG_RD, NULL, "");
2465         for (i = 0; i < domains; i++) {
2466                 zdom = ZDOM_GET(zone, i);
2467                 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
2468                     OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, NULL, "");
2469                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2470                     "nitems", CTLFLAG_RD, &zdom->uzd_nitems,
2471                     "number of items in this domain");
2472                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2473                     "imax", CTLFLAG_RD, &zdom->uzd_imax,
2474                     "maximum item count in this period");
2475                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2476                     "imin", CTLFLAG_RD, &zdom->uzd_imin,
2477                     "minimum item count in this period");
2478                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2479                     "wss", CTLFLAG_RD, &zdom->uzd_wss,
2480                     "Working set size");
2481         }
2482
2483         /*
2484          * General statistics.
2485          */
2486         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2487             "stats", CTLFLAG_RD, NULL, "");
2488         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2489             "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
2490             zone, 1, sysctl_handle_uma_zone_cur, "I",
2491             "Current number of allocated items");
2492         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2493             "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2494             zone, 0, sysctl_handle_uma_zone_allocs, "QU",
2495             "Total allocation calls");
2496         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2497             "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2498             zone, 0, sysctl_handle_uma_zone_frees, "QU",
2499             "Total free calls");
2500         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2501             "fails", CTLFLAG_RD, &zone->uz_fails,
2502             "Number of allocation failures");
2503         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2504             "xdomain", CTLFLAG_RD, &zone->uz_xdomain,
2505             "Free calls from the wrong domain");
2506 }
2507
/* Argument block for zone_count(). */
struct uma_zone_count {
        const char      *name;          /* zone name to look for */
        int             count;          /* 1 + highest uz_namecnt seen so far */
};
2512
2513 static void
2514 zone_count(uma_zone_t zone, void *arg)
2515 {
2516         struct uma_zone_count *cnt;
2517
2518         cnt = arg;
2519         /*
2520          * Some zones are rapidly created with identical names and
2521          * destroyed out of order.  This can lead to gaps in the count.
2522          * Use one greater than the maximum observed for this name.
2523          */
2524         if (strcmp(zone->uz_name, cnt->name) == 0)
2525                 cnt->count = MAX(cnt->count,
2526                     zone->uz_namecnt + 1);
2527 }
2528
2529 static void
2530 zone_update_caches(uma_zone_t zone)
2531 {
2532         int i;
2533
2534         for (i = 0; i <= mp_maxid; i++) {
2535                 cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size);
2536                 cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags);
2537         }
2538 }
2539
/*
 * Zone header ctor.  This initializes all fields, locks, etc.
 *
 * Arguments/Returns follow uma_ctor specifications
 *      udata  Actually uma_zctor_args
 *
 * Depending on the arguments the zone is wired up in one of four ways:
 *      - arg->import set:         pure cache zone using the caller's
 *                                 import/release; linked on uma_cachezones.
 *      - UMA_ZONE_SECONDARY set:  appended to the zone list of arg->keg.
 *      - arg->keg == NULL:        a new keg is created with uma_kcreate().
 *      - arg->keg != NULL:        the supplied keg memory is constructed in
 *                                 place with keg_ctor().
 *
 * Returns 0 on success, ENOMEM when uma_kcreate() returns NULL, or the
 * error reported by keg_ctor().
 */
static int
zone_ctor(void *mem, int size, void *udata, int flags)
{
        struct uma_zone_count cnt;
        struct uma_zctor_args *arg = udata;
        uma_zone_domain_t zdom;
        uma_zone_t zone = mem;
        uma_zone_t z;
        uma_keg_t keg;
        int i;

        bzero(zone, size);
        zone->uz_name = arg->name;
        zone->uz_ctor = arg->ctor;
        zone->uz_dtor = arg->dtor;
        zone->uz_init = NULL;
        zone->uz_fini = NULL;
        zone->uz_sleeps = 0;
        zone->uz_bucket_size = 0;
        zone->uz_bucket_size_min = 0;
        zone->uz_bucket_size_max = BUCKET_MAX;
        /* Only the SMR bit of the request is copied at this point. */
        zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
        zone->uz_warning = NULL;
        /* The domain structures follow the cpu structures. */
        zone->uz_bucket_max = ULONG_MAX;
        timevalclear(&zone->uz_ratecheck);

        /* Count the number of duplicate names. */
        cnt.name = arg->name;
        cnt.count = 0;
        zone_foreach(zone_count, &cnt);
        zone->uz_namecnt = cnt.count;
        ZONE_CROSS_LOCK_INIT(zone);

        /* Set up the lock and the empty bucket list of every domain. */
        for (i = 0; i < vm_ndomains; i++) {
                zdom = ZDOM_GET(zone, i);
                ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS));
                STAILQ_INIT(&zdom->uzd_buckets);
        }

#ifdef INVARIANTS
        /* The trash init/fini pair marks a zone as trash-checked. */
        if (arg->uminit == trash_init && arg->fini == trash_fini)
                zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
#endif

        /*
         * This is a pure cache zone, no kegs.
         */
        if (arg->import) {
                KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0,
                    ("zone_ctor: Import specified for non-cache zone."));
                /* Replaces, rather than adds to, the flags set above. */
                zone->uz_flags = arg->flags;
                zone->uz_size = arg->size;
                zone->uz_import = arg->import;
                zone->uz_release = arg->release;
                zone->uz_arg = arg->arg;
#ifdef NUMA
                /*
                 * Cache zones are round-robin unless a policy is
                 * specified because they may have incompatible
                 * constraints.
                 */
                if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0)
                        zone->uz_flags |= UMA_ZONE_ROUNDROBIN;
#endif
                rw_wlock(&uma_rwlock);
                LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
                rw_wunlock(&uma_rwlock);
                goto out;
        }

        /*
         * Use the regular zone/keg/slab allocator.
         */
        zone->uz_import = zone_import;
        zone->uz_release = zone_release;
        zone->uz_arg = zone;
        keg = arg->keg;

        if (arg->flags & UMA_ZONE_SECONDARY) {
                /*
                 * NOTE(review): the condition tests UMA_ZONE_SECONDARY in
                 * uz_flags while the message talks about
                 * UMA_ZFLAG_INTERNAL -- confirm which was intended.
                 */
                KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
                    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
                KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
                zone->uz_init = arg->uminit;
                zone->uz_fini = arg->fini;
                zone->uz_flags |= UMA_ZONE_SECONDARY;
                rw_wlock(&uma_rwlock);
                ZONE_LOCK(zone);
                /* Walk to the last zone on the keg and append after it. */
                LIST_FOREACH(z, &keg->uk_zones, uz_link) {
                        if (LIST_NEXT(z, uz_link) == NULL) {
                                LIST_INSERT_AFTER(z, zone, uz_link);
                                break;
                        }
                }
                ZONE_UNLOCK(zone);
                rw_wunlock(&uma_rwlock);
        } else if (keg == NULL) {
                if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
                    arg->align, arg->flags)) == NULL)
                        return (ENOMEM);
        } else {
                struct uma_kctor_args karg;
                int error;

                /* We should only be here from uma_startup() */
                karg.size = arg->size;
                karg.uminit = arg->uminit;
                karg.fini = arg->fini;
                karg.align = arg->align;
                /* The SMR bit is kept on the zone and not given to the keg. */
                karg.flags = (arg->flags & ~UMA_ZONE_SMR);
                karg.zone = zone;
                error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
                    flags);
                if (error)
                        return (error);
        }

        /* Inherit properties from the keg. */
        zone->uz_keg = keg;
        zone->uz_size = keg->uk_size;
        zone->uz_flags |= (keg->uk_flags &
            (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));

out:
        /*
         * Before BOOT_RUNNING the counters are only placeholders;
         * uma_startup3() runs zone_alloc_counters() and zone_alloc_sysctl()
         * over every zone later.
         */
        if (__predict_true(booted >= BOOT_RUNNING)) {
                zone_alloc_counters(zone, NULL);
                zone_alloc_sysctl(zone, NULL);
        } else {
                zone->uz_allocs = EARLY_COUNTER;
                zone->uz_frees = EARLY_COUNTER;
                zone->uz_fails = EARLY_COUNTER;
        }

        /* Caller requests a private SMR context. */
        if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
                zone->uz_smr = smr_create(zone->uz_name, 0, 0);

        /*
         * Pick the bucket size from the request flags.  INTERNAL zones get
         * a maximum of zero; otherwise MAXBUCKET, MINBUCKET and NOBUCKET are
         * tried in that order before falling back to bucket_select().
         */
        KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
            (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
            ("Invalid zone flag combination"));
        if (arg->flags & UMA_ZFLAG_INTERNAL)
                zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
        if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
                zone->uz_bucket_size = BUCKET_MAX;
        else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0)
                zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN;
        else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
                zone->uz_bucket_size = 0;
        else
                zone->uz_bucket_size = bucket_select(zone->uz_size);
        zone->uz_bucket_size_min = zone->uz_bucket_size;
        if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
                zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
        /* Publish the final size and flags to the per-CPU caches. */
        zone_update_caches(zone);

        return (0);
}
2703
2704 /*
2705  * Keg header dtor.  This frees all data, destroys locks, frees the hash
2706  * table and removes the keg from the global list.
2707  *
2708  * Arguments/Returns follow uma_dtor specifications
2709  *      udata  unused
2710  */
2711 static void
2712 keg_dtor(void *arg, int size, void *udata)
2713 {
2714         uma_keg_t keg;
2715         uint32_t free, pages;
2716         int i;
2717
2718         keg = (uma_keg_t)arg;
2719         free = pages = 0;
2720         for (i = 0; i < vm_ndomains; i++) {
2721                 free += keg->uk_domain[i].ud_free_items;
2722                 pages += keg->uk_domain[i].ud_pages;
2723                 KEG_LOCK_FINI(keg, i);
2724         }
2725         if (pages != 0)
2726                 printf("Freed UMA keg (%s) was not empty (%u items). "
2727                     " Lost %u pages of memory.\n",
2728                     keg->uk_name ? keg->uk_name : "",
2729                     pages / keg->uk_ppera * keg->uk_ipers - free, pages);
2730
2731         hash_free(&keg->uk_hash);
2732 }
2733
/*
 * Zone header dtor.  Tears the zone down in this order: remove its sysctl
 * node, drain the per-CPU caches (skipped for UMA_ZFLAG_INTERNAL zones),
 * unlink the zone from its list, call zone_reclaim(), free the keg when
 * this zone owns it, then release the counters, the sysctl name and the
 * locks.
 *
 * Arguments/Returns follow uma_dtor specifications
 *      udata  unused
 */
static void
zone_dtor(void *arg, int size, void *udata)
{
        uma_zone_t zone;
        uma_keg_t keg;
        int i;

        zone = (uma_zone_t)arg;

        sysctl_remove_oid(zone->uz_oid, 1, 1);

        if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
                cache_drain(zone);

        /* Unlink first so list walkers no longer find the zone. */
        rw_wlock(&uma_rwlock);
        LIST_REMOVE(zone, uz_link);
        rw_wunlock(&uma_rwlock);
        zone_reclaim(zone, M_WAITOK, true);

        /*
         * We only destroy kegs from non secondary/non cache zones.
         */
        if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
                keg = zone->uz_keg;
                rw_wlock(&uma_rwlock);
                LIST_REMOVE(keg, uk_link);
                rw_wunlock(&uma_rwlock);
                /* SKIP_NONE: the kegs zone dtor runs on the keg. */
                zone_free_item(kegs, keg, NULL, SKIP_NONE);
        }
        counter_u64_free(zone->uz_allocs);
        counter_u64_free(zone->uz_frees);
        counter_u64_free(zone->uz_fails);
        counter_u64_free(zone->uz_xdomain);
        free(zone->uz_ctlname, M_UMA);
        /* Destroy the locks that zone_ctor() initialized. */
        for (i = 0; i < vm_ndomains; i++)
                ZDOM_LOCK_FINI(ZDOM_GET(zone, i));
        ZONE_CROSS_LOCK_FINI(zone);
}
2778
2779 static void
2780 zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg)
2781 {
2782         uma_keg_t keg;
2783         uma_zone_t zone;
2784
2785         LIST_FOREACH(keg, &uma_kegs, uk_link) {
2786                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2787                         zfunc(zone, arg);
2788         }
2789         LIST_FOREACH(zone, &uma_cachezones, uz_link)
2790                 zfunc(zone, arg);
2791 }
2792
/*
 * Traverses every zone in the system and calls a callback, holding
 * uma_rwlock as a reader for the duration of the walk.
 *
 * Arguments:
 *      zfunc  A pointer to a function which accepts a zone
 *              as an argument.
 *      arg    Opaque pointer passed unchanged to every zfunc call.
 *
 * Returns:
 *      Nothing
 */
static void
zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
{

        rw_rlock(&uma_rwlock);
        zone_foreach_unlocked(zfunc, arg);
        rw_runlock(&uma_rwlock);
}
2811
/*
 * Initialize the kernel memory allocator.  This is done after pages can be
 * allocated but before general KVA is available.
 *
 * Arguments:
 *      virtual_avail  Recorded as both bootstart and bootmem.
 *
 * The two bootstrap zones ("UMA Kegs" and "UMA Zones") and the keg that
 * backs the kegs zone are carved out of one startup_alloc() request and
 * constructed by calling zone_ctor() directly.  The slab-header zones and
 * the hash zone are then created through uma_zcreate().
 */
void
uma_startup1(vm_offset_t virtual_avail)
{
        struct uma_zctor_args args;
        size_t ksize, zsize, size;
        uma_keg_t masterkeg;
        uintptr_t m;
        uint8_t pflag;

        bootstart = bootmem = virtual_avail;

        rw_init(&uma_rwlock, "UMA lock");
        sx_init(&uma_reclaim_lock, "umareclaim");

        /* A keg is followed by one uma_domain per memory domain. */
        ksize = sizeof(struct uma_keg) +
            (sizeof(struct uma_domain) * vm_ndomains);
        ksize = roundup(ksize, UMA_SUPER_ALIGN);
        /* A zone is followed by its per-CPU caches and per-domain data. */
        zsize = sizeof(struct uma_zone) +
            (sizeof(struct uma_cache) * (mp_maxid + 1)) +
            (sizeof(struct uma_zone_domain) * vm_ndomains);
        zsize = roundup(zsize, UMA_SUPER_ALIGN);

        /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */
        size = (zsize * 2) + ksize;
        /* NOTE(review): the startup_alloc() result is used unchecked. */
        m = (uintptr_t)startup_alloc(NULL, size, 0, &pflag, M_NOWAIT | M_ZERO);
        /* Layout: [zones zone][kegs zone][master keg]. */
        zones = (uma_zone_t)m;
        m += zsize;
        kegs = (uma_zone_t)m;
        m += zsize;
        masterkeg = (uma_keg_t)m;

        /* "manually" create the initial zone */
        memset(&args, 0, sizeof(args));
        args.name = "UMA Kegs";
        args.size = ksize;
        args.ctor = keg_ctor;
        args.dtor = keg_dtor;
        args.uminit = zero_init;
        args.fini = NULL;
        /* A non-NULL keg makes zone_ctor() construct it in place. */
        args.keg = masterkeg;
        args.align = UMA_SUPER_ALIGN - 1;
        args.flags = UMA_ZFLAG_INTERNAL;
        zone_ctor(kegs, zsize, &args, M_WAITOK);

        args.name = "UMA Zones";
        args.size = zsize;
        args.ctor = zone_ctor;
        args.dtor = zone_dtor;
        args.uminit = zero_init;
        args.fini = NULL;
        /* With no keg supplied zone_ctor() creates one via uma_kcreate(). */
        args.keg = NULL;
        args.align = UMA_SUPER_ALIGN - 1;
        args.flags = UMA_ZFLAG_INTERNAL;
        zone_ctor(zones, zsize, &args, M_WAITOK);

        /* Now make zones for slab headers */
        slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE,
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
        slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE,
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);

        hashzone = uma_zcreate("UMA Hash",
            sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);

        bucket_init();
        smr_init();
}
2884
#ifndef UMA_MD_SMALL_ALLOC
/* Defined outside this file; called by uma_startup2() under the same #ifndef. */
extern void vm_radix_reserve_kva(void);
#endif
2888
/*
 * Advertise the availability of normal kva allocations and switch to
 * the default back-end allocator.  Marks the KVA we consumed on startup
 * as used in the map.
 *
 * Sets booted to BOOT_KVA, runs zone_kva_available() on every zone and
 * finishes with bucket_enable().
 */
void
uma_startup2(void)
{

        /* bootmem differs from bootstart only if boot memory was used. */
        if (bootstart != bootmem) {
                vm_map_lock(kernel_map);
                (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem,
                    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
                vm_map_unlock(kernel_map);
        }

#ifndef UMA_MD_SMALL_ALLOC
        /* Set up radix zone to use noobj_alloc. */
        vm_radix_reserve_kva();
#endif

        booted = BOOT_KVA;
        /* The zone lists are walked without taking uma_rwlock here. */
        zone_foreach_unlocked(zone_kva_available, NULL);
        bucket_enable();
}
2914
/*
 * Finish our initialization steps.
 *
 * Gives every existing zone its counters and sysctl nodes (zone_ctor()
 * defers both while booted is below BOOT_RUNNING), starts the periodic
 * uma_timeout() callout, switches booted to BOOT_RUNNING and registers
 * uma_shutdown() as a shutdown_post_sync handler.
 */
static void
uma_startup3(void)
{

#ifdef INVARIANTS
        TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
        uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
        uma_skip_cnt = counter_u64_alloc(M_WAITOK);
#endif
        zone_foreach_unlocked(zone_alloc_counters, NULL);
        zone_foreach_unlocked(zone_alloc_sysctl, NULL);
        callout_init(&uma_callout, 1);
        /* First run is UMA_TIMEOUT * hz ticks from now. */
        callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
        booted = BOOT_RUNNING;

        EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
            EVENTHANDLER_PRI_FIRST);
}
2936
/*
 * shutdown_post_sync event handler (registered in uma_startup3()).
 * Records that the system is shutting down; uma_zdestroy() checks for
 * BOOT_SHUTDOWN to skip needless reclamation work.
 */
static void
uma_shutdown(void)
{

        booted = BOOT_SHUTDOWN;
}
2943
2944 static uma_keg_t
2945 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2946                 int align, uint32_t flags)
2947 {
2948         struct uma_kctor_args args;
2949
2950         args.size = size;
2951         args.uminit = uminit;
2952         args.fini = fini;
2953         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2954         args.flags = flags;
2955         args.zone = zone;
2956         return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2957 }
2958
2959 /* Public functions */
2960 /* See uma.h */
2961 void
2962 uma_set_align(int align)
2963 {
2964
2965         if (align != UMA_ALIGN_CACHE)
2966                 uma_align_cache = align;
2967 }
2968
2969 /* See uma.h */
2970 uma_zone_t
2971 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2972                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
2973
2974 {
2975         struct uma_zctor_args args;
2976         uma_zone_t res;
2977
2978         KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2979             align, name));
2980
2981         /* This stuff is essential for the zone ctor */
2982         memset(&args, 0, sizeof(args));
2983         args.name = name;
2984         args.size = size;
2985         args.ctor = ctor;
2986         args.dtor = dtor;
2987         args.uminit = uminit;
2988         args.fini = fini;
2989 #ifdef  INVARIANTS
2990         /*
2991          * Inject procedures which check for memory use after free if we are
2992          * allowed to scramble the memory while it is not allocated.  This
2993          * requires that: UMA is actually able to access the memory, no init
2994          * or fini procedures, no dependency on the initial value of the
2995          * memory, and no (legitimate) use of the memory after free.  Note,
2996          * the ctor and dtor do not need to be empty.
2997          */
2998         if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH |
2999             UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) {
3000                 args.uminit = trash_init;
3001                 args.fini = trash_fini;
3002         }
3003 #endif
3004         args.align = align;
3005         args.flags = flags;
3006         args.keg = NULL;
3007
3008         sx_slock(&uma_reclaim_lock);
3009         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
3010         sx_sunlock(&uma_reclaim_lock);
3011
3012         return (res);
3013 }
3014
3015 /* See uma.h */
3016 uma_zone_t
3017 uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
3018     uma_init zinit, uma_fini zfini, uma_zone_t master)
3019 {
3020         struct uma_zctor_args args;
3021         uma_keg_t keg;
3022         uma_zone_t res;
3023
3024         keg = master->uz_keg;
3025         memset(&args, 0, sizeof(args));
3026         args.name = name;
3027         args.size = keg->uk_size;
3028         args.ctor = ctor;
3029         args.dtor = dtor;
3030         args.uminit = zinit;
3031         args.fini = zfini;
3032         args.align = keg->uk_align;
3033         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
3034         args.keg = keg;
3035
3036         sx_slock(&uma_reclaim_lock);
3037         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
3038         sx_sunlock(&uma_reclaim_lock);
3039
3040         return (res);
3041 }
3042
3043 /* See uma.h */
3044 uma_zone_t
3045 uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
3046     uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
3047     void *arg, int flags)
3048 {
3049         struct uma_zctor_args args;
3050
3051         memset(&args, 0, sizeof(args));
3052         args.name = name;
3053         args.size = size;
3054         args.ctor = ctor;
3055         args.dtor = dtor;
3056         args.uminit = zinit;
3057         args.fini = zfini;
3058         args.import = zimport;
3059         args.release = zrelease;
3060         args.arg = arg;
3061         args.align = 0;
3062         args.flags = flags | UMA_ZFLAG_CACHE;
3063
3064         return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
3065 }
3066
3067 /* See uma.h */
3068 void
3069 uma_zdestroy(uma_zone_t zone)
3070 {
3071
3072         /*
3073          * Large slabs are expensive to reclaim, so don't bother doing
3074          * unnecessary work if we're shutting down.
3075          */
3076         if (booted == BOOT_SHUTDOWN &&
3077             zone->uz_fini == NULL && zone->uz_release == zone_release)
3078                 return;
3079         sx_slock(&uma_reclaim_lock);
3080         zone_free_item(zones, zone, NULL, SKIP_NONE);
3081         sx_sunlock(&uma_reclaim_lock);
3082 }
3083
3084 void
3085 uma_zwait(uma_zone_t zone)
3086 {
3087
3088         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
3089                 uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK));
3090         else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0)
3091                 uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK));
3092         else
3093                 uma_zfree(zone, uma_zalloc(zone, M_WAITOK));
3094 }
3095
3096 void *
3097 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
3098 {
3099         void *item, *pcpu_item;
3100 #ifdef SMP
3101         int i;
3102
3103         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
3104 #endif
3105         item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
3106         if (item == NULL)
3107                 return (NULL);
3108         pcpu_item = zpcpu_base_to_offset(item);
3109         if (flags & M_ZERO) {
3110 #ifdef SMP
3111                 for (i = 0; i <= mp_maxid; i++)
3112                         bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
3113 #else
3114                 bzero(item, zone->uz_size);
3115 #endif
3116         }
3117         return (pcpu_item);
3118 }
3119
3120 /*
3121  * A stub while both regular and pcpu cases are identical.
3122  */
3123 void
3124 uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata)
3125 {
3126         void *item;
3127
3128 #ifdef SMP
3129         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
3130 #endif
3131         item = zpcpu_offset_to_base(pcpu_item);
3132         uma_zfree_arg(zone, item, udata);
3133 }
3134
/*
 * Run the allocation-side per-item work on a freshly obtained item.
 *
 * Arguments:
 *      zone      Zone the item was allocated from
 *      uz_flags  Zone flags as read by the caller (cache_alloc_item()
 *                passes the per-CPU cached copy)
 *      size      Item size handed to the ctor and used for M_ZERO
 *      udata     Opaque argument for the ctor
 *      flags     M_* allocation flags
 *      item      The item to construct
 *
 * Returns:
 *      The item, or NULL if the zone ctor failed.  On ctor failure the
 *      failure counter is bumped and the item is released with
 *      SKIP_DTOR | SKIP_CNT.
 */
static inline void *
item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
    void *item)
{
#ifdef INVARIANTS
        bool skipdbg;

        skipdbg = uma_dbg_zskip(zone, item);
        /* Run trash_ctor ourselves unless it already is the zone ctor. */
        if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
            zone->uz_ctor != trash_ctor)
                trash_ctor(item, size, udata, flags);
#endif
        /* Check flags before loading ctor pointer. */
        if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
            __predict_false(zone->uz_ctor != NULL) &&
            zone->uz_ctor(item, size, udata, flags) != 0) {
                counter_u64_add(zone->uz_fails, 1);
                zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
                return (NULL);
        }
#ifdef INVARIANTS
        if (!skipdbg)
                uma_dbg_alloc(zone, NULL, item);
#endif
        /* Zeroing happens after the ctor has run. */
        if (__predict_false(flags & M_ZERO))
                return (memset(item, 0, size));

        return (item);
}
3164
/*
 * Run the free-side per-item work on an item that is being released.
 *
 * Arguments:
 *      zone   Zone the item belongs to
 *      item   The item being freed
 *      size   Item size handed to the dtor
 *      udata  Opaque argument for the dtor
 *      skip   How much of the free path to skip; the dtor only runs when
 *             skip is below SKIP_DTOR
 */
static inline void
item_dtor(uma_zone_t zone, void *item, int size, void *udata,
    enum zfreeskip skip)
{
#ifdef INVARIANTS
        bool skipdbg;

        skipdbg = uma_dbg_zskip(zone, item);
        if (skip == SKIP_NONE && !skipdbg) {
                /* Only UMA_ZONE_MALLOC zones hand udata to uma_dbg_free(). */
                if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
                        uma_dbg_free(zone, udata, item);
                else
                        uma_dbg_free(zone, NULL, item);
        }
#endif
        if (__predict_true(skip < SKIP_DTOR)) {
                if (zone->uz_dtor != NULL)
                        zone->uz_dtor(item, size, udata);
#ifdef INVARIANTS
                /* Run trash_dtor ourselves unless it already is the dtor. */
                if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
                    zone->uz_dtor != trash_dtor)
                        trash_dtor(item, size, udata);
#endif
        }
}
3190
3191 #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
3192 #define UMA_ZALLOC_DEBUG
3193 static int
3194 uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
3195 {
3196         int error;
3197
3198         error = 0;
3199 #ifdef WITNESS
3200         if (flags & M_WAITOK) {
3201                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
3202                     "uma_zalloc_debug: zone \"%s\"", zone->uz_name);
3203         }
3204 #endif
3205
3206 #ifdef INVARIANTS
3207         KASSERT((flags & M_EXEC) == 0,
3208             ("uma_zalloc_debug: called with M_EXEC"));
3209         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3210             ("uma_zalloc_debug: called within spinlock or critical section"));
3211         KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
3212             ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
3213 #endif
3214
3215 #ifdef DEBUG_MEMGUARD
3216         if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
3217                 void *item;
3218                 item = memguard_alloc(zone->uz_size, flags);
3219                 if (item != NULL) {
3220                         error = EJUSTRETURN;
3221                         if (zone->uz_init != NULL &&
3222                             zone->uz_init(item, zone->uz_size, flags) != 0) {
3223                                 *itemp = NULL;
3224                                 return (error);
3225                         }
3226                         if (zone->uz_ctor != NULL &&
3227                             zone->uz_ctor(item, zone->uz_size, udata,
3228                             flags) != 0) {
3229                                 counter_u64_add(zone->uz_fails, 1);
3230                                 zone->uz_fini(item, zone->uz_size);
3231                                 *itemp = NULL;
3232                                 return (error);
3233                         }
3234                         *itemp = item;
3235                         return (error);
3236                 }
3237                 /* This is unfortunate but should not be fatal. */
3238         }
3239 #endif
3240         return (error);
3241 }
3242
/*
 * Debug-only hook for the free path.
 *
 * Returns EJUSTRETURN when the item was a memguard address of a non-SMR
 * zone and has been fully released here (dtor, fini, memguard_free());
 * returns 0 when the caller must continue with the normal free path.
 */
static int
uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
{
        KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
            ("uma_zfree_debug: called with spinlock or critical section held"));

#ifdef DEBUG_MEMGUARD
        if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
                /* Both hooks are optional, so each is checked first. */
                if (zone->uz_dtor != NULL)
                        zone->uz_dtor(item, zone->uz_size, udata);
                if (zone->uz_fini != NULL)
                        zone->uz_fini(item, zone->uz_size);
                memguard_free(item);
                return (EJUSTRETURN);
        }
#endif
        return (0);
}
3261 #endif
3262
/*
 * Take one item out of a per-CPU bucket and construct it.
 *
 * Entered inside a critical section, which is exited here.  The item, the
 * size and the flags are all read from the per-CPU cache before
 * critical_exit(); item_ctor() runs after it.
 * Returns whatever item_ctor() returns.
 */
static inline void *
cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket,
    void *udata, int flags)
{
        void *item;
        int size, uz_flags;

        item = cache_bucket_pop(cache, bucket);
        size = cache_uz_size(cache);
        uz_flags = cache_uz_flags(cache);
        critical_exit();
        return (item_ctor(zone, uz_flags, size, udata, flags, item));
}
3276
3277 static __noinline void *
3278 cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
3279 {
3280         uma_cache_bucket_t bucket;
3281         int domain;
3282
3283         while (cache_alloc(zone, cache, udata, flags)) {
3284                 cache = &zone->uz_cpu[curcpu];
3285                 bucket = &cache->uc_allocbucket;
3286                 if (__predict_false(bucket->ucb_cnt == 0))
3287                         continue;
3288                 return (cache_alloc_item(zone, cache, bucket, udata, flags));
3289         }
3290         critical_exit();
3291
3292         /*
3293          * We can not get a bucket so try to return a single item.
3294          */
3295         if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH)
3296                 domain = PCPU_GET(domain);
3297         else
3298                 domain = UMA_ANYDOMAIN;
3299         return (zone_alloc_item(zone, udata, domain, flags));
3300 }
3301
3302 /* See uma.h */
3303 void *
3304 uma_zalloc_smr(uma_zone_t zone, int flags)
3305 {
3306         uma_cache_bucket_t bucket;
3307         uma_cache_t cache;
3308
3309 #ifdef UMA_ZALLOC_DEBUG
3310         void *item;
3311
3312         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
3313             ("uma_zalloc_arg: called with non-SMR zone.\n"));
3314         if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
3315                 return (item);
3316 #endif
3317
3318         critical_enter();
3319         cache = &zone->uz_cpu[curcpu];
3320         bucket = &cache->uc_allocbucket;
3321         if (__predict_false(bucket->ucb_cnt == 0))
3322                 return (cache_alloc_retry(zone, cache, NULL, flags));
3323         return (cache_alloc_item(zone, cache, bucket, NULL, flags));
3324 }
3325
/*
 * See uma.h.  Allocate an item from a non-SMR zone.
 *
 * Arguments:
 *      zone   Zone to allocate from
 *      udata  Opaque argument handed to the zone ctor
 *      flags  M_* allocation flags
 *
 * Returns:
 *      The item handed out by the debug hook, by the per-CPU alloc bucket
 *      (cache_alloc_item()), or by the slow path (cache_alloc_retry()).
 */
void *
uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
{
        uma_cache_bucket_t bucket;
        uma_cache_t cache;

        /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
        random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);

        /* This is the fast path allocation */
        CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
            zone, flags);

#ifdef UMA_ZALLOC_DEBUG
        void *item;

        KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
            ("uma_zalloc_arg: called with SMR zone.\n"));
        /* EJUSTRETURN means the debug hook already produced the result. */
        if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
                return (item);
#endif

        /*
         * If possible, allocate from the per-CPU cache.  There are two
         * requirements for safe access to the per-CPU cache: (1) the thread
         * accessing the cache must not be preempted or yield during access,
         * and (2) the thread must not migrate CPUs without switching which
         * cache it accesses.  We rely on a critical section to prevent
         * preemption and migration.  We release the critical section in
         * order to acquire the zone mutex if we are unable to allocate from
         * the current cache; when we re-acquire the critical section, we
         * must detect and handle migration if it has occurred.
         */
        critical_enter();
        cache = &zone->uz_cpu[curcpu];
        bucket = &cache->uc_allocbucket;
        /* Both helpers below leave the critical section entered above. */
        if (__predict_false(bucket->ucb_cnt == 0))
                return (cache_alloc_retry(zone, cache, udata, flags));
        return (cache_alloc_item(zone, cache, bucket, udata, flags));
}
3367
/*
 * Replenish an alloc bucket and possibly restore an old one.  Called in
 * a critical section.  Returns in a critical section.
 *
 * A false return value indicates an allocation failure.
 * A true return value indicates success and the caller should retry.
 *
 * The critical section is dropped and re-entered along the way, so the
 * 'cache' pointer passed in may no longer belong to the current CPU when
 * this returns; callers look their cache up again.
 */
static __noinline bool
cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
{
        uma_bucket_t bucket;
        int domain;
        bool new;

        CRITICAL_ASSERT(curthread);

        /*
         * If we have run out of items in our alloc bucket see
         * if we can switch with the free bucket.
         *
         * SMR Zones can't re-use the free bucket until the sequence has
         * expired.
         */
        if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 &&
            cache->uc_freebucket.ucb_cnt != 0) {
                cache_bucket_swap(&cache->uc_freebucket,
                    &cache->uc_allocbucket);
                return (true);
        }

        /*
         * Discard any empty allocation bucket while we hold no locks.
         */
        bucket = cache_bucket_unload_alloc(cache);
        critical_exit();

        if (bucket != NULL) {
                KASSERT(bucket->ub_cnt == 0,
                    ("cache_alloc: Entered with non-empty alloc bucket."));
                bucket_free(zone, bucket, udata);
        }

        /* Short-circuit for zones without buckets and low memory. */
        if (zone->uz_bucket_size == 0 || bucketdisable) {
                /* Honor the "returns in a critical section" contract. */
                critical_enter();
                return (false);
        }

        /*
         * Attempt to retrieve the item from the per-CPU cache has failed, so
         * we must go back to the zone.  This requires the zdom lock, so we
         * must drop the critical section, then re-acquire it when we go back
         * to the cache.  Since the critical section is released, we may be
         * preempted or migrate.  As such, make sure not to maintain any
         * thread-local state specific to the cache from prior to releasing
         * the critical section.
         */
        domain = PCPU_GET(domain);
        if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0)
                domain = zone_domain_highest(zone, domain);
        /* Prefer a bucket already cached on the zone; else import one. */
        bucket = cache_fetch_bucket(zone, cache, domain);
        if (bucket == NULL) {
                bucket = zone_alloc_bucket(zone, udata, domain, flags);
                new = true;
        } else
                new = false;

        CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
            zone->uz_name, zone, bucket);
        if (bucket == NULL) {
                critical_enter();
                return (false);
        }

        /*
         * See if we lost the race or were migrated.  Cache the
         * initialized bucket to make this less likely or claim
         * the memory directly.
         */
        critical_enter();
        cache = &zone->uz_cpu[curcpu];
        if (cache->uc_allocbucket.ucb_bucket == NULL &&
            ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 ||
            domain == PCPU_GET(domain))) {
                /* Only freshly imported buckets are added to uzd_imax. */
                if (new)
                        atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax,
                            bucket->ub_cnt);
                cache_bucket_load_alloc(cache, bucket);
                return (true);
        }

        /*
         * We lost the race, release this bucket and start over.
         */
        critical_exit();
        zone_put_bucket(zone, domain, bucket, udata, false);
        critical_enter();

        return (true);
}
3468
3469 /*
3470  * Allocate one item from 'zone' for the given 'domain'.  The request is
3471  * handed directly to zone_alloc_item() together with 'udata' and 'flags'.
3472  */
3473 void *
3474 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
3475 {
3476
3477         /* Feed the entropy harvester (RANDOM_ENABLE_UMA kernel option). */
3478         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3479         CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d",
3480             zone->uz_name, zone, domain, flags);
3481         if ((flags & M_WAITOK) != 0) {
3482                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
3483                     "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
3484         }
3485         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3486             ("uma_zalloc_domain: called with spinlock or critical section held"));
3487         return (zone_alloc_item(zone, udata, domain, flags));
3488 }
3489
3490 /*
3491  * Return a slab with at least one free item, or NULL if there is none.
3492  * Partially used slabs are preferred to limit fragmentation; a fully
3493  * free slab that gets picked is first moved onto the partial list.
3494  *
3495  * When 'rr' is true all domains are scanned once starting at 'domain',
3496  * otherwise only 'domain' is examined.
3497  */
3498 static uma_slab_t
3499 keg_first_slab(uma_keg_t keg, int domain, bool rr)
3500 {
3501         uma_domain_t kdom;
3502         uma_slab_t found;
3503         int cur;
3504
3505         KASSERT(domain >= 0 && domain < vm_ndomains,
3506             ("keg_first_slab: domain %d out of range", domain));
3507         KEG_LOCK_ASSERT(keg, domain);
3508
3509         cur = domain;
3510         for (;;) {
3511                 kdom = &keg->uk_domain[cur];
3512                 found = LIST_FIRST(&kdom->ud_part_slab);
3513                 if (found != NULL)
3514                         return (found);
3515                 found = LIST_FIRST(&kdom->ud_free_slab);
3516                 if (found != NULL) {
3517                         LIST_REMOVE(found, us_link);
3518                         kdom->ud_free_slabs--;
3519                         LIST_INSERT_HEAD(&kdom->ud_part_slab, found, us_link);
3520                         return (found);
3521                 }
3522                 if (!rr || (cur = (cur + 1) % vm_ndomains) == domain)
3523                         return (NULL);
3524         }
3525 }
3526
3527 /*
3528  * Fetch a slab with free items from the keg's existing slab lists.  Returns
3529  * with the keg domain lock held on success and with it dropped on failure.
3530  */
3531 static uma_slab_t
3532 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
3533 {
3534         uma_slab_t found;
3535         uint32_t held_back;
3536
3537         /* HASH kegs have a single free list, kept under domain 0. */
3538         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
3539                 domain = 0;
3540         KEG_LOCK(keg, domain);
3541         /* The keg's reserve is off limits unless M_USE_RESERVE is given. */
3542         held_back = (flags & M_USE_RESERVE) == 0 ? keg->uk_reserve : 0;
3543         found = NULL;
3544         if (keg->uk_domain[domain].ud_free_items > held_back)
3545                 found = keg_first_slab(keg, domain, rr);
3546         if (found == NULL)
3547                 KEG_UNLOCK(keg, domain);
3548         return (found);
3549 }
3550
     /*
      * Return a slab of 'keg' that has at least one free item, or NULL.
      *
      * 'rdomain' names the domain to use, or is UMA_ANYDOMAIN to walk the
      * domains chosen by the keg's domainset policy.  Slabs already on the
      * keg's lists are preferred; a new slab is only requested from
      * keg_alloc_slab() when there is none and M_NOVM is not set.
      *
      * NOTE(review): a slab found by keg_fetch_free_slab() is returned with
      * the keg domain lock held, and zone_import() unlocks after every
      * successful call, so keg_alloc_slab() presumably returns locked as
      * well -- confirm against its definition.
      */
3551 static uma_slab_t
3552 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
3553 {
3554         struct vm_domainset_iter di;
3555         uma_slab_t slab;
3556         int aflags, domain;
3557         bool rr;
3558
3559 restart:
3560         /*
3561          * Use the keg's policy if upper layers haven't already specified a
3562          * domain (as happens with first-touch zones).
3563          *
3564          * To avoid races we run the iterator with the keg lock held, but that
3565          * means that we cannot allow the vm_domainset layer to sleep.  Thus,
3566          * clear M_WAITOK and handle low memory conditions locally.
3567          */
3568         rr = rdomain == UMA_ANYDOMAIN;
3569         if (rr) {
3570                 aflags = (flags & ~M_WAITOK) | M_NOWAIT;
3571                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3572                     &aflags);
3573         } else {
3574                 aflags = flags;
3575                 domain = rdomain;
3576         }
3577
3578         for (;;) {
                     /* First try the slabs the keg already owns. */
3579                 slab = keg_fetch_free_slab(keg, domain, rr, flags);
3580                 if (slab != NULL)
3581                         return (slab);
3582
3583                 /*
3584                  * M_NOVM means don't ask at all!
3585                  */
3586                 if (flags & M_NOVM)
3587                         break;
3588
3589                 slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
3590                 if (slab != NULL)
3591                         return (slab);
                     /* A fixed domain was requested and we may not sleep. */
3592                 if (!rr && (flags & M_WAITOK) == 0)
3593                         break;
                     /*
                      * NOTE(review): a non-zero return presumably means the
                      * policy has no further domain to offer -- confirm.  A
                      * sleepable request then waits for memory and starts
                      * over; anything else gives up.
                      */
3594                 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
3595                         if ((flags & M_WAITOK) != 0) {
3596                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3597                                 goto restart;
3598                         }
3599                         break;
3600                 }
3601         }
3602
3603         /*
3604          * We might not have been able to get a slab but another cpu
3605          * could have while we were unlocked.  Check again before we
3606          * fail.
3607          */
3608         if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL)
3609                 return (slab);
3610
3611         return (NULL);
3612 }
3613
3614 /*
3615  * Allocate one item from 'slab' by clearing the first set bit of its
3616  * us_free bitset and updating the slab and domain free counts.  The keg
3617  * lock of the slab's domain must be held.
3618  */
3619 static void *
3620 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
3621 {
3622         uma_domain_t kdom;
3623         void *item;
3624         int idx;
3625
3626         KEG_LOCK_ASSERT(keg, slab->us_domain);
3627
3628         idx = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
3629         BIT_CLR(keg->uk_ipers, idx, &slab->us_free);
3630         item = slab_item(slab, keg, idx);
3631         kdom = &keg->uk_domain[slab->us_domain];
3632         slab->us_freecount--;
3633         kdom->ud_free_items--;
3634         if (slab->us_freecount != 0)
3635                 return (item);
3636
3637         /* Last free item taken: move from the partial to the full list. */
3638         LIST_REMOVE(slab, us_link);
3639         LIST_INSERT_HEAD(&kdom->ud_full_slab, slab, us_link);
3640         return (item);
3641 }
3642
     /*
      * Fill 'bucket' with up to 'max' items allocated from the slabs of the
      * keg behind the zone passed as 'arg'.  'domain' and 'flags' are handed
      * to keg_fetch_slab().  Returns the number of items stored, which is
      * less than 'max' when no further slab could be obtained.
      */
3643 static int
3644 zone_import(void *arg, void **bucket, int max, int domain, int flags)
3645 {
3646         uma_domain_t dom;
3647         uma_zone_t zone;
3648         uma_slab_t slab;
3649         uma_keg_t keg;
3650 #ifdef NUMA
3651         int stripe;
3652 #endif
3653         int i;
3654
3655         zone = arg;
3656         slab = NULL;
3657         keg = zone->uz_keg;
3658         /* Try to keep the buckets totally full */
3659         for (i = 0; i < max; ) {
3660                 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
3661                         break;
3662 #ifdef NUMA
                     /* Number of items to take before picking a new slab. */
3663                 stripe = howmany(max, vm_ndomains);
3664 #endif
3665                 dom = &keg->uk_domain[slab->us_domain];
3666                 while (slab->us_freecount && i < max) {
3667                         bucket[i++] = slab_alloc_item(keg, slab);
                             /* Leave this slab once the domain is at its reserve. */
3668                         if (dom->ud_free_items <= keg->uk_reserve)
3669                                 break;
3670 #ifdef NUMA
3671                         /*
3672                          * If the zone is striped we pick a new slab for every
3673                          * N allocations.  Eliminating this conditional will
3674                          * instead pick a new domain for each bucket rather
3675                          * than stripe within each bucket.  The current option
3676                          * produces more fragmentation and requires more cpu
3677                          * time but yields better distribution.
3678                          */
3679                         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 &&
3680                             vm_ndomains > 1 && --stripe == 0)
3681                                 break;
3682 #endif
3683                 }
                     /* Drop the lock that came back held from keg_fetch_slab(). */
3684                 KEG_UNLOCK(keg, slab->us_domain);
3685                 /* Don't block if we allocated any successfully. */
3686                 flags &= ~M_WAITOK;
3687                 flags |= M_NOWAIT;
3688         }
3689
3690         return i;
3691 }
3692
     /*
      * Slow path of zone_alloc_limit(): the caller's fetchadd of 'count' took
      * the zone to or past uz_max_items, or sleepers were already queued.
      *
      * Returns the number of items reserved.  With M_NOWAIT this is always 0.
      * Otherwise the result is between 1 and 'count', and the thread sleeps
      * on &zone->uz_max_items first if that is necessary.
      */
3693 static int
3694 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
3695 {
3696         uint64_t old, new, total, max;
3697
3698         /*
3699          * The hard case.  We're going to sleep because there were existing
3700          * sleepers or because we ran out of items.  This routine enforces
3701          * fairness by keeping fifo order.
3702          *
3703          * First release our ill gotten gains and make some noise.
3704          */
3705         for (;;) {
                     /* Undoes the caller's fetchadd, or the one of the last pass. */
3706                 zone_free_limit(zone, count);
3707                 zone_log_warning(zone);
3708                 zone_maxaction(zone);
3709                 if (flags & M_NOWAIT)
3710                         return (0);
3711
3712                 /*
3713                  * We need to allocate an item or set ourselves as a sleeper
3714                  * while the sleepq lock is held to avoid wakeup races.  This
3715                  * is essentially a home rolled semaphore.
3716                  */
3717                 sleepq_lock(&zone->uz_max_items);
3718                 old = zone->uz_items;
3719                 do {
3720                         MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
3721                         /* Cache the max since we will evaluate twice. */
3722                         max = zone->uz_max_items;
3723                         if (UZ_ITEMS_SLEEPERS(old) != 0 ||
3724                             UZ_ITEMS_COUNT(old) >= max)
3725                                 new = old + UZ_ITEMS_SLEEPER;
3726                         else
                                     /*
                                      * No sleepers are recorded on this branch, so
                                      * 'old' is presumably the plain item count.
                                      */
3727                                 new = old + MIN(count, max - old);
3728                 } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
3729
3730                 /* We may have successfully allocated under the sleepq lock. */
3731                 if (UZ_ITEMS_SLEEPERS(new) == 0) {
3732                         sleepq_release(&zone->uz_max_items);
3733                         return (new - old);
3734                 }
3735
3736                 /*
3737                  * This is in a different cacheline from uz_items so that we
3738                  * don't constantly invalidate the fastpath cacheline when we
3739                  * adjust item counts.  This could be limited to toggling on
3740                  * transitions.
3741                  */
3742                 atomic_add_32(&zone->uz_sleepers, 1);
3743                 atomic_add_64(&zone->uz_sleeps, 1);
3744
3745                 /*
3746                  * We have added ourselves as a sleeper.  The sleepq lock
3747                  * protects us from wakeup races.  Sleep now and then retry.
3748                  */
3749                 sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
3750                 sleepq_wait(&zone->uz_max_items, PVM);
3751
3752                 /*
3753                  * After wakeup, remove ourselves as a sleeper and try
3754                  * again.  We no longer have the sleepq lock for protection.
3755                  *
3756                  * Subtract ourselves as a sleeper while attempting to add
3757                  * our count.
3758                  */
3759                 atomic_subtract_32(&zone->uz_sleepers, 1);
3760                 old = atomic_fetchadd_64(&zone->uz_items,
3761                     -(UZ_ITEMS_SLEEPER - count));
3762                 /* We're no longer a sleeper. */
3763                 old -= UZ_ITEMS_SLEEPER;
3764
3765                 /*
3766                  * If we're still at the limit, restart.  Notably do not
3767                  * block on other sleepers.  Cache the max value to protect
3768                  * against changes via sysctl.
3769                  */
3770                 total = UZ_ITEMS_COUNT(old);
3771                 max = zone->uz_max_items;
3772                 if (total >= max)
3773                         continue;
3774                 /* Truncate if necessary, otherwise wake other sleepers. */
3775                 if (total + count > max) {
3776                         zone_free_limit(zone, total + count - max);
3777                         count = max - total;
3778                 } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
3779                         wakeup_one(&zone->uz_max_items);
3780
3781                 return (count);
3782         }
3783 }
3784
3785 /*
3786  * Reserve up to 'count' items against the zone's max_items limit and
3787  * return how many were granted.  Unless M_NOWAIT is specified the call
3788  * sleeps until at least one item can be reserved.
3789  */
3790 static int
3791 zone_alloc_limit(uma_zone_t zone, int count, int flags)
3792 {
3793         uint64_t limit, prev, total;
3794
3795         limit = zone->uz_max_items;
3796         MPASS(limit > 0);
3797
3798         /* Normal allocations are expected to succeed with one fetchadd. */
3799         prev = atomic_fetchadd_64(&zone->uz_items, count);
3800         total = prev + count;
3801         if (__predict_true(total <= limit))
3802                 return (count);
3803
3804         /*
3805          * The zone was already full, or sleepers are queued: give the
3806          * request to the slow path, which undoes the addition above.
3807          */
3808         if (prev >= limit)
3809                 return (zone_alloc_limit_hard(zone, count, flags));
3810
3811         /*
3812          * Only part of the request fits.  Release the excess, which may
3813          * wake sleepers that weren't woken because we were temporarily
3814          * over the limit, and return the truncated count.
3815          */
3816         zone_free_limit(zone, total - limit);
3817         return (limit - prev);
3818 }
3819
3820 /*
3821  * Return 'count' previously reserved items to the zone's item limit.
3822  * One sleeper is woken if sleepers are recorded and the item count has
3823  * dropped below the limit.
3824  */
3825 static void
3826 zone_free_limit(uma_zone_t zone, int count)
3827 {
3828         uint64_t prev;
3829
3830         MPASS(count > 0);
3831
3832         prev = atomic_fetchadd_64(&zone->uz_items, -count);
3833
3834         /*
3835          * In the common case there are no sleepers, or the zone is
3836          * still at or over its limit after the release, and nothing
3837          * needs to be woken.  Otherwise wake just one thread to
3838          * moderate the rate of wakeups; sleepers will continue to
3839          * generate wakeups if necessary.
3840          */
3841         if (__predict_false(UZ_ITEMS_SLEEPERS(prev) != 0 &&
3842             UZ_ITEMS_COUNT(prev) - count < zone->uz_max_items))
3843                 wakeup_one(&zone->uz_max_items);
3844 }
3845
3846 static uma_bucket_t
3847 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3848 {
3849         uma_bucket_t bucket;
3850         int maxbucket, cnt;
3851
3852         CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
3853             zone, domain);
3854
3855         /* Avoid allocs targeting empty domains. */
3856         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3857                 domain = UMA_ANYDOMAIN;
3858         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
3859                 domain = UMA_ANYDOMAIN;
3860
3861         if (zone->uz_max_items > 0)
3862                 maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
3863                     M_NOWAIT);
3864         else
3865                 maxbucket = zone->uz_bucket_size;
3866         if (maxbucket == 0)
3867                 return (false);
3868
3869         /* Don't wait for buckets, preserve caller's NOVM setting. */
3870         bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3871         if (bucket == NULL) {
3872                 cnt = 0;
3873                 goto out;
3874         }
3875
3876         bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3877             MIN(maxbucket, bucket->ub_entries), domain, flags);
3878
3879         /*
3880          * Initialize the memory if necessary.
3881          */
3882         if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3883                 int i;
3884
3885                 for (i = 0; i < bucket->ub_cnt; i++)
3886                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3887                             flags) != 0)
3888                                 break;
3889                 /*
3890                  * If we couldn't initialize the whole bucket, put the
3891                  * rest back onto the freelist.
3892                  */
3893                 if (i != bucket->ub_cnt) {
3894                         zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3895                             bucket->ub_cnt - i);
3896 #ifdef INVARIANTS
3897                         bzero(&bucket->ub_bucket[i],
3898                             sizeof(void *) * (bucket->ub_cnt - i));
3899 #endif
3900                         bucket->ub_cnt = i;
3901                 }
3902         }
3903
3904         cnt = bucket->ub_cnt;
3905         if (bucket->ub_cnt == 0) {
3906                 bucket_free(zone, bucket, udata);
3907                 counter_u64_add(zone->uz_fails, 1);
3908                 bucket = NULL;
3909         }
3910 out:
3911         if (zone->uz_max_items > 0 && cnt < maxbucket)
3912                 zone_free_limit(zone, maxbucket - cnt);
3913
3914         return (bucket);
3915 }
3916
3917 /*
3918  * Allocates a single item from a zone.
3919  *
3920  * Arguments
3921  *      zone   The zone to alloc for.
3922  *      udata  The data to be passed to the constructor.
3923  *      domain The domain to allocate from or UMA_ANYDOMAIN.
3924  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
3925  *
3926  * Returns
3927  *      NULL if the item limit was hit or the import, init or ctor failed
3928  *      An item if successful
3929  */
3930 static void *
3931 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3932 {
3933         void *item;
3934
3935         /* A refused limit reservation is a failed allocation: count it. */
3936         if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) {
3937                 counter_u64_add(zone->uz_fails, 1);
3938                 return (NULL);
3939         }
3940
3941         /* Avoid allocs targeting empty domains. */
3942         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3943                 domain = UMA_ANYDOMAIN;
3944
3945         if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3946                 goto fail_cnt;
3947
3948         /*
3949          * We have to call both the zone's init (not the keg's init)
3950          * and the zone's ctor.  This is because the item is going from
3951          * a keg slab directly to the user, and the user is expecting it
3952          * to be both zone-init'd as well as zone-ctor'd.
3953          */
3954         if (zone->uz_init != NULL &&
3955             zone->uz_init(item, zone->uz_size, flags) != 0) {
3956                 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3957                 goto fail_cnt;
3958         }
3959         item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
3960             item);
3961         if (item == NULL)
3962                 goto fail;
3963
3964         counter_u64_add(zone->uz_allocs, 1);
3965         CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3966             zone->uz_name, zone);
3967         return (item);
3968
3969 fail_cnt:
3970         counter_u64_add(zone->uz_fails, 1);
3971 fail:
3972         if (zone->uz_max_items > 0)
3973                 zone_free_limit(zone, 1);
3974         CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3975             zone->uz_name, zone);
3976
3977         return (NULL);
3978 }
3979
3980 /* See uma.h */
     /*
      * Free 'item' to an SMR zone.  The item is pushed onto the current CPU's
      * free bucket, or onto its cross-domain bucket when the zone is
      * FIRSTTOUCH and, under NUMA, the item belongs to another domain.  While
      * the chosen bucket is full, cache_free() is asked to make room; once it
      * reports that the cache layer cannot take the item, the item is
      * released through zone_free_item().  No udata is passed anywhere.
      */
3981 void
3982 uma_zfree_smr(uma_zone_t zone, void *item)
3983 {
3984         uma_cache_t cache;
3985         uma_cache_bucket_t bucket;
3986         int itemdomain, uz_flags;
3987
3988 #ifdef UMA_ZALLOC_DEBUG
3989         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
3990             ("uma_zfree_smr: called with non-SMR zone.\n"));
3991         KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
3992         SMR_ASSERT_NOT_ENTERED(zone->uz_smr);
3993         if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
3994                 return;
3995 #endif
             /*
              * The flags are read outside of the critical section; the cache
              * pointer itself is fetched again inside the loop below.
              */
3996         cache = &zone->uz_cpu[curcpu];
3997         uz_flags = cache_uz_flags(cache);
             /* Only FIRSTTOUCH zones under NUMA look up the item's domain. */
3998         itemdomain = 0;
3999 #ifdef NUMA
4000         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4001                 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4002 #endif
4003         critical_enter();
4004         do {
4005                 cache = &zone->uz_cpu[curcpu];
4006                 /* SMR Zones must free to the free bucket. */
4007                 bucket = &cache->uc_freebucket;
4008 #ifdef NUMA
4009                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
4010                     PCPU_GET(domain) != itemdomain) {
4011                         bucket = &cache->uc_crossbucket;
4012                 }
4013 #endif
4014                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
4015                         cache_bucket_push(cache, bucket, item);
4016                         critical_exit();
4017                         return;
4018                 }
                     /* cache_free() returns true when the free should be retried. */
4019         } while (cache_free(zone, cache, NULL, item, itemdomain));
4020         critical_exit();
4021
4022         /*
4023          * If nothing else caught this, we'll just do an internal free.
4024          */
4025         zone_free_item(zone, item, NULL, SKIP_NONE);
4026 }
4027
4028 /* See uma.h */
     /*
      * Free 'item' to 'zone'; 'udata' is passed to the destructor.  A NULL
      * item is ignored.
      *
      * The destructor is run first where required, then the item is pushed
      * into one of the current CPU's cache buckets.  If the zone has a limit
      * with threads sleeping on it, or if cache_free() reports that the cache
      * layer cannot take the item, it is released through zone_free_item()
      * with SKIP_DTOR because the destructor step was handled here.
      */
4029 void
4030 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
4031 {
4032         uma_cache_t cache;
4033         uma_cache_bucket_t bucket;
4034         int itemdomain, uz_flags;
4035
4036         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
4037         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
4038
4039         CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
4040
4041 #ifdef UMA_ZALLOC_DEBUG
4042         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
4043             ("uma_zfree_arg: called with SMR zone.\n"));
4044         if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
4045                 return;
4046 #endif
4047         /* uma_zfree(..., NULL) does nothing, to match free(9). */
4048         if (item == NULL)
4049                 return;
4050
4051         /*
4052          * We are accessing the per-cpu cache without a critical section to
4053          * fetch size and flags.  This is acceptable, if we are preempted we
4054          * will simply read another cpu's line.
4055          */
4056         cache = &zone->uz_cpu[curcpu];
4057         uz_flags = cache_uz_flags(cache);
4058         if (UMA_ALWAYS_CTORDTOR ||
4059             __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
4060                 item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
4061
4062         /*
4063          * The race here is acceptable.  If we miss it we'll just have to wait
4064          * a little longer for the limits to be reset.
4065          */
4066         if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
                     /*
                      * NOTE(review): the per-CPU cache is bypassed while
                      * threads sleep on the limit, presumably so that the
                      * accounting in zone_free_item() can wake them -- confirm.
                      */
4067                 if (zone->uz_sleepers > 0)
4068                         goto zfree_item;
4069         }
4070
4071         /*
4072          * If possible, free to the per-CPU cache.  There are two
4073          * requirements for safe access to the per-CPU cache: (1) the thread
4074          * accessing the cache must not be preempted or yield during access,
4075          * and (2) the thread must not migrate CPUs without switching which
4076          * cache it accesses.  We rely on a critical section to prevent
4077          * preemption and migration.  We release the critical section in
4078          * order to acquire the zone mutex if we are unable to free to the
4079          * current cache; when we re-acquire the critical section, we must
4080          * detect and handle migration if it has occurred.
4081          */
4082         itemdomain = 0;
4083 #ifdef NUMA
4084         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4085                 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4086 #endif
4087         critical_enter();
4088         do {
4089                 cache = &zone->uz_cpu[curcpu];
4090                 /*
4091                  * Try to free into the allocbucket first to give LIFO
4092                  * ordering for cache-hot datastructures.  Spill over
4093                  * into the freebucket if necessary.  Alloc will swap
4094                  * them if one runs dry.
4095                  */
4096                 bucket = &cache->uc_allocbucket;
4097 #ifdef NUMA
4098                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
4099                     PCPU_GET(domain) != itemdomain) {
4100                         bucket = &cache->uc_crossbucket;
4101                 } else
4102 #endif
4103                 if (bucket->ucb_cnt >= bucket->ucb_entries)
4104                         bucket = &cache->uc_freebucket;
4105                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
4106                         cache_bucket_push(cache, bucket, item);
4107                         critical_exit();
4108                         return;
4109                 }
                     /* cache_free() returns true when the free should be retried. */
4110         } while (cache_free(zone, cache, udata, item, itemdomain));
4111         critical_exit();
4112
4113         /*
4114          * If nothing else caught this, we'll just do an internal free.
4115          */
4116 zfree_item:
4117         zone_free_item(zone, item, udata, SKIP_DTOR);
4118 }
4119
4120 #ifdef NUMA
4121 /*
4122  * Sort crossdomain free buckets to domain correct buckets and cache
4123  * them.
      *
      * Every item of 'bucket' is moved into the uzd_cross bucket of the zone
      * domain its memory belongs to.  Cross buckets that fill up are collected
      * and passed to zone_put_bucket() once the cross lock has been dropped.
      * 'bucket' itself is handed to bucket_free() at the end.
4124  */
4125 static void
4126 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
4127 {
4128         struct uma_bucketlist fullbuckets;
4129         uma_zone_domain_t zdom;
4130         uma_bucket_t b;
4131         smr_seq_t seq;
4132         void *item;
4133         int domain;
4134
4135         CTR3(KTR_UMA,
4136             "uma_zfree: zone %s(%p) draining cross bucket %p",
4137             zone->uz_name, zone, bucket);
4138
4139         /*
4140          * It is possible for buckets to arrive here out of order so we fetch
4141          * the current smr seq rather than accepting the bucket's.
4142          */
4143         seq = SMR_SEQ_INVALID;
4144         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
4145                 seq = smr_advance(zone->uz_smr);
4146
4147         /*
4148          * To avoid having ndomain * ndomain buckets for sorting we have a
4149          * lock on the current crossfree bucket.  A full matrix with
4150          * per-domain locking could be used if necessary.
4151          */
4152         STAILQ_INIT(&fullbuckets);
4153         ZONE_CROSS_LOCK(zone);
4154         while (bucket->ub_cnt > 0) {
                     /* Items are taken from the end of the source bucket. */
4155                 item = bucket->ub_bucket[bucket->ub_cnt - 1];
4156                 domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4157                 zdom = ZDOM_GET(zone, domain);
4158                 if (zdom->uzd_cross == NULL) {
4159                         zdom->uzd_cross = bucket_alloc(zone, udata, M_NOWAIT);
                             /*
                              * NOTE(review): when no bucket can be had the loop
                              * stops with items still left in 'bucket';
                              * bucket_free() below is presumably expected to
                              * cope with a non-empty bucket -- confirm.
                              */
4160                         if (zdom->uzd_cross == NULL)
4161                                 break;
4162                 }
4163                 b = zdom->uzd_cross;
4164                 b->ub_bucket[b->ub_cnt++] = item;
                     /* Stamp the target with the sequence fetched above. */
4165                 b->ub_seq = seq;
4166                 if (b->ub_cnt == b->ub_entries) {
4167                         STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link);
4168                         zdom->uzd_cross = NULL;
4169                 }
4170                 bucket->ub_cnt--;
4171         }
4172         ZONE_CROSS_UNLOCK(zone);
4173         if (bucket->ub_cnt == 0)
4174                 bucket->ub_seq = SMR_SEQ_INVALID;
4175         bucket_free(zone, bucket, udata);
4176
             /*
              * A cross bucket only ever receives items of one domain, so the
              * first item is enough to tell where a full bucket belongs.
              */
4177         while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) {
4178                 STAILQ_REMOVE_HEAD(&fullbuckets, ub_link);
4179                 domain = _vm_phys_domain(pmap_kextract(
4180                     (vm_offset_t)b->ub_bucket[0]));
4181                 zone_put_bucket(zone, domain, b, udata, true);
4182         }
4183 }
4184 #endif
4185
4186 /*
4187  * Dispose of a bucket of freed items: FIRSTTOUCH zones sort foreign-domain
4188  * buckets with zone_free_cross() when there are more than two domains;
4189  * in every other case the bucket goes to zone_put_bucket().
4190  */
4191 static void
4192 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
4193     int itemdomain, bool ws)
4194 {
4195
4196 #ifdef NUMA
4197         /*
4198          * On a two domain system a bucket from the wrong domain holds only
4199          * items of the one other domain and can be cached as it is.  With
4200          * more domains the items have to be sorted back individually.
4201          */
4202         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 2 &&
4203             PCPU_GET(domain) != itemdomain) {
4204                 zone_free_cross(zone, bucket, udata);
4205                 return;
4206         }
4207 #endif
4208         CTR3(KTR_UMA,
4209             "uma_zfree: zone %s(%p) putting bucket %p on free list",
4210             zone->uz_name, zone, bucket);
4211         /* Save the bucket in the zone's domain bucket cache. */
4212         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
4213                 itemdomain = zone_domain_lowest(zone, itemdomain);
4214         zone_put_bucket(zone, itemdomain, bucket, udata, ws);
4215 }
4216
4217 /*
4218  * Populate a free or cross bucket for the current cpu cache.  Free any
4219  * existing full bucket either to the zone cache or back to the slab layer.
4220  *
4221  * Enters and returns in a critical section.  false return indicates that
4222  * we can not satisfy this free in the cache layer.  true indicates that
4223  * the caller should retry.
4224  */
4225 static __noinline bool
4226 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
4227     int itemdomain)
4228 {
4229         uma_cache_bucket_t cbucket;
4230         uma_bucket_t newbucket, bucket;
4231
4232         CRITICAL_ASSERT(curthread);
4233
4234         if (zone->uz_bucket_size == 0)
4235                 return false;
4236
4237         cache = &zone->uz_cpu[curcpu];
4238         newbucket = NULL;
4239
4240         /*
4241          * FIRSTTOUCH domains need to free to the correct zdom.  When
4242          * enabled this is the zdom of the item.   The bucket is the
4243          * cross bucket if the current domain and itemdomain do not match.
4244          */
4245         cbucket = &cache->uc_freebucket;
4246 #ifdef NUMA
4247         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
4248                 if (PCPU_GET(domain) != itemdomain) {
4249                         cbucket = &cache->uc_crossbucket;
4250                         if (cbucket->ucb_cnt != 0)
4251                                 counter_u64_add(zone->uz_xdomain,
4252                                     cbucket->ucb_cnt);
4253                 }
4254         }
4255 #endif
4256         bucket = cache_bucket_unload(cbucket);
4257         KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries,
4258             ("cache_free: Entered with non-full free bucket."));
4259
4260         /* We are no longer associated with this CPU. */
4261         critical_exit();
4262
4263         /*
4264          * Don't let SMR zones operate without a free bucket.  Force
4265          * a synchronize and re-use this one.  We will only degrade
4266          * to a synchronize every bucket_size items rather than every
4267          * item if we fail to allocate a bucket.
4268          */
4269         if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
4270                 if (bucket != NULL)
4271                         bucket->ub_seq = smr_advance(zone->uz_smr);
4272                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
4273                 if (newbucket == NULL && bucket != NULL) {
4274                         bucket_drain(zone, bucket);
4275                         newbucket = bucket;
4276                         bucket = NULL;
4277                 }
4278         } else if (!bucketdisable)
4279                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
4280
4281         if (bucket != NULL)
4282                 zone_free_bucket(zone, bucket, udata, itemdomain, true);
4283
4284         critical_enter();
4285         if ((bucket = newbucket) == NULL)
4286                 return (false);
4287         cache = &zone->uz_cpu[curcpu];
4288 #ifdef NUMA
4289         /*
4290          * Check to see if we should be populating the cross bucket.  If it
4291          * is already populated we will fall through and attempt to populate
4292          * the free bucket.
4293          */
4294         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
4295                 if (PCPU_GET(domain) != itemdomain &&
4296                     cache->uc_crossbucket.ucb_bucket == NULL) {
4297                         cache_bucket_load_cross(cache, bucket);
4298                         return (true);
4299                 }
4300         }
4301 #endif
4302         /*
4303          * We may have lost the race to fill the bucket or switched CPUs.
4304          */
4305         if (cache->uc_freebucket.ucb_bucket != NULL) {
4306                 critical_exit();
4307                 bucket_free(zone, bucket, udata);
4308                 critical_enter();
4309         } else
4310                 cache_bucket_load_free(cache, bucket);
4311
4312         return (true);
4313 }
4314
4315 void
4316 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
4317 {
4318
4319         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
4320         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
4321
4322         CTR2(KTR_UMA, "uma_zfree_domain zone %s(%p)", zone->uz_name, zone);
4323
4324         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
4325             ("uma_zfree_domain: called with spinlock or critical section held"));
4326
4327         /* uma_zfree(..., NULL) does nothing, to match free(9). */
4328         if (item == NULL)
4329                 return;
4330         zone_free_item(zone, item, udata, SKIP_NONE);
4331 }
4332
4333 static void
4334 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
4335 {
4336         uma_keg_t keg;
4337         uma_domain_t dom;
4338         int freei;
4339
4340         keg = zone->uz_keg;
4341         KEG_LOCK_ASSERT(keg, slab->us_domain);
4342
4343         /* Do we need to remove from any lists? */
4344         dom = &keg->uk_domain[slab->us_domain];
4345         if (slab->us_freecount + 1 == keg->uk_ipers) {
4346                 LIST_REMOVE(slab, us_link);
4347                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
4348                 dom->ud_free_slabs++;
4349         } else if (slab->us_freecount == 0) {
4350                 LIST_REMOVE(slab, us_link);
4351                 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
4352         }
4353
4354         /* Slab management. */
4355         freei = slab_item_index(slab, keg, item);
4356         BIT_SET(keg->uk_ipers, freei, &slab->us_free);
4357         slab->us_freecount++;
4358
4359         /* Keg statistics. */
4360         dom->ud_free_items++;
4361 }
4362
4363 static void
4364 zone_release(void *arg, void **bucket, int cnt)
4365 {
4366         struct mtx *lock;
4367         uma_zone_t zone;
4368         uma_slab_t slab;
4369         uma_keg_t keg;
4370         uint8_t *mem;
4371         void *item;
4372         int i;
4373
4374         zone = arg;
4375         keg = zone->uz_keg;
4376         lock = NULL;
4377         if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0))
4378                 lock = KEG_LOCK(keg, 0);
4379         for (i = 0; i < cnt; i++) {
4380                 item = bucket[i];
4381                 if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) {
4382                         slab = vtoslab((vm_offset_t)item);
4383                 } else {
4384                         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4385                         if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0)
4386                                 slab = hash_sfind(&keg->uk_hash, mem);
4387                         else
4388                                 slab = (uma_slab_t)(mem + keg->uk_pgoff);
4389                 }
4390                 if (lock != KEG_LOCKPTR(keg, slab->us_domain)) {
4391                         if (lock != NULL)
4392                                 mtx_unlock(lock);
4393                         lock = KEG_LOCK(keg, slab->us_domain);
4394                 }
4395                 slab_free_item(zone, slab, item);
4396         }
4397         if (lock != NULL)
4398                 mtx_unlock(lock);
4399 }
4400
4401 /*
4402  * Frees a single item to any zone.
4403  *
4404  * Arguments:
4405  *      zone   The zone to free to
4406  *      item   The item we're freeing
4407  *      udata  User supplied data for the dtor
4408  *      skip   Skip dtors and finis
4409  */
4410 static __noinline void
4411 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
4412 {
4413
4414         /*
4415          * If a free is sent directly to an SMR zone we have to
4416          * synchronize immediately because the item can instantly
4417          * be reallocated. This should only happen in degenerate
4418          * cases when no memory is available for per-cpu caches.
4419          */
4420         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
4421                 smr_synchronize(zone->uz_smr);
4422
4423         item_dtor(zone, item, zone->uz_size, udata, skip);
4424
4425         if (skip < SKIP_FINI && zone->uz_fini)
4426                 zone->uz_fini(item, zone->uz_size);
4427
4428         zone->uz_release(zone->uz_arg, &item, 1);
4429
4430         if (skip & SKIP_CNT)
4431                 return;
4432
4433         counter_u64_add(zone->uz_frees, 1);
4434
4435         if (zone->uz_max_items > 0)
4436                 zone_free_limit(zone, 1);
4437 }
4438
4439 /* See uma.h */
4440 int
4441 uma_zone_set_max(uma_zone_t zone, int nitems)
4442 {
4443         struct uma_bucket_zone *ubz;
4444         int count;
4445
4446         /*
4447          * XXX This can misbehave if the zone has any allocations with
4448          * no limit and a limit is imposed.  There is currently no
4449          * way to clear a limit.
4450          */
4451         ZONE_LOCK(zone);
4452         ubz = bucket_zone_max(zone, nitems);
4453         count = ubz != NULL ? ubz->ubz_entries : 0;
4454         zone->uz_bucket_size_max = zone->uz_bucket_size = count;
4455         if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
4456                 zone->uz_bucket_size_min = zone->uz_bucket_size_max;
4457         zone->uz_max_items = nitems;
4458         zone->uz_flags |= UMA_ZFLAG_LIMIT;
4459         zone_update_caches(zone);
4460         /* We may need to wake waiters. */
4461         wakeup(&zone->uz_max_items);
4462         ZONE_UNLOCK(zone);
4463
4464         return (nitems);
4465 }
4466
4467 /* See uma.h */
4468 void
4469 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
4470 {
4471         struct uma_bucket_zone *ubz;
4472         int bpcpu;
4473
4474         ZONE_LOCK(zone);
4475         ubz = bucket_zone_max(zone, nitems);
4476         if (ubz != NULL) {
4477                 bpcpu = 2;
4478                 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4479                         /* Count the cross-domain bucket. */
4480                         bpcpu++;
4481                 nitems -= ubz->ubz_entries * bpcpu * mp_ncpus;
4482                 zone->uz_bucket_size_max = ubz->ubz_entries;
4483         } else {
4484                 zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
4485         }
4486         if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
4487                 zone->uz_bucket_size_min = zone->uz_bucket_size_max;
4488         zone->uz_bucket_max = nitems / vm_ndomains;
4489         ZONE_UNLOCK(zone);
4490 }
4491
4492 /* See uma.h */
4493 int
4494 uma_zone_get_max(uma_zone_t zone)
4495 {
4496         int nitems;
4497
4498         nitems = atomic_load_64(&zone->uz_max_items);
4499
4500         return (nitems);
4501 }
4502
4503 /* See uma.h */
4504 void
4505 uma_zone_set_warning(uma_zone_t zone, const char *warning)
4506 {
4507
4508         ZONE_ASSERT_COLD(zone);
4509         zone->uz_warning = warning;
4510 }
4511
4512 /* See uma.h */
4513 void
4514 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
4515 {
4516
4517         ZONE_ASSERT_COLD(zone);
4518         TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
4519 }
4520
4521 /* See uma.h */
4522 int
4523 uma_zone_get_cur(uma_zone_t zone)
4524 {
4525         int64_t nitems;
4526         u_int i;
4527
4528         nitems = 0;
4529         if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER)
4530                 nitems = counter_u64_fetch(zone->uz_allocs) -
4531                     counter_u64_fetch(zone->uz_frees);
4532         CPU_FOREACH(i)
4533                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) -
4534                     atomic_load_64(&zone->uz_cpu[i].uc_frees);
4535
4536         return (nitems < 0 ? 0 : nitems);
4537 }
4538
4539 static uint64_t
4540 uma_zone_get_allocs(uma_zone_t zone)
4541 {
4542         uint64_t nitems;
4543         u_int i;
4544
4545         nitems = 0;
4546         if (zone->uz_allocs != EARLY_COUNTER)
4547                 nitems = counter_u64_fetch(zone->uz_allocs);
4548         CPU_FOREACH(i)
4549                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs);
4550
4551         return (nitems);
4552 }
4553
4554 static uint64_t
4555 uma_zone_get_frees(uma_zone_t zone)
4556 {
4557         uint64_t nitems;
4558         u_int i;
4559
4560         nitems = 0;
4561         if (zone->uz_frees != EARLY_COUNTER)
4562                 nitems = counter_u64_fetch(zone->uz_frees);
4563         CPU_FOREACH(i)
4564                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees);
4565
4566         return (nitems);
4567 }
4568
4569 #ifdef INVARIANTS
4570 /* Used only for KEG_ASSERT_COLD(). */
4571 static uint64_t
4572 uma_keg_get_allocs(uma_keg_t keg)
4573 {
4574         uma_zone_t z;
4575         uint64_t nitems;
4576
4577         nitems = 0;
4578         LIST_FOREACH(z, &keg->uk_zones, uz_link)
4579                 nitems += uma_zone_get_allocs(z);
4580
4581         return (nitems);
4582 }
4583 #endif
4584
4585 /* See uma.h */
4586 void
4587 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
4588 {
4589         uma_keg_t keg;
4590
4591         KEG_GET(zone, keg);
4592         KEG_ASSERT_COLD(keg);
4593         keg->uk_init = uminit;
4594 }
4595
4596 /* See uma.h */
4597 void
4598 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
4599 {
4600         uma_keg_t keg;
4601
4602         KEG_GET(zone, keg);
4603         KEG_ASSERT_COLD(keg);
4604         keg->uk_fini = fini;
4605 }
4606
4607 /* See uma.h */
4608 void
4609 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
4610 {
4611
4612         ZONE_ASSERT_COLD(zone);
4613         zone->uz_init = zinit;
4614 }
4615
4616 /* See uma.h */
4617 void
4618 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
4619 {
4620
4621         ZONE_ASSERT_COLD(zone);
4622         zone->uz_fini = zfini;
4623 }
4624
4625 /* See uma.h */
4626 void
4627 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
4628 {
4629         uma_keg_t keg;
4630
4631         KEG_GET(zone, keg);
4632         KEG_ASSERT_COLD(keg);
4633         keg->uk_freef = freef;
4634 }
4635
4636 /* See uma.h */
4637 void
4638 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
4639 {
4640         uma_keg_t keg;
4641
4642         KEG_GET(zone, keg);
4643         KEG_ASSERT_COLD(keg);
4644         keg->uk_allocf = allocf;
4645 }
4646
4647 /* See uma.h */
4648 void
4649 uma_zone_set_smr(uma_zone_t zone, smr_t smr)
4650 {
4651
4652         ZONE_ASSERT_COLD(zone);
4653
4654         zone->uz_flags |= UMA_ZONE_SMR;
4655         zone->uz_smr = smr;
4656         zone_update_caches(zone);
4657 }
4658
4659 smr_t
4660 uma_zone_get_smr(uma_zone_t zone)
4661 {
4662
4663         return (zone->uz_smr);
4664 }
4665
4666 /* See uma.h */
4667 void
4668 uma_zone_reserve(uma_zone_t zone, int items)
4669 {
4670         uma_keg_t keg;
4671
4672         KEG_GET(zone, keg);
4673         KEG_ASSERT_COLD(keg);
4674         keg->uk_reserve = items;
4675 }
4676
4677 /* See uma.h */
4678 int
4679 uma_zone_reserve_kva(uma_zone_t zone, int count)
4680 {
4681         uma_keg_t keg;
4682         vm_offset_t kva;
4683         u_int pages;
4684
4685         KEG_GET(zone, keg);
4686         KEG_ASSERT_COLD(keg);
4687         ZONE_ASSERT_COLD(zone);
4688
4689         pages = howmany(count, keg->uk_ipers) * keg->uk_ppera;
4690
4691 #ifdef UMA_MD_SMALL_ALLOC
4692         if (keg->uk_ppera > 1) {
4693 #else
4694         if (1) {
4695 #endif
4696                 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
4697                 if (kva == 0)
4698                         return (0);
4699         } else
4700                 kva = 0;
4701
4702         MPASS(keg->uk_kva == 0);
4703         keg->uk_kva = kva;
4704         keg->uk_offset = 0;
4705         zone->uz_max_items = pages * keg->uk_ipers;
4706 #ifdef UMA_MD_SMALL_ALLOC
4707         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
4708 #else
4709         keg->uk_allocf = noobj_alloc;
4710 #endif
4711         keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
4712         zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
4713         zone_update_caches(zone);
4714
4715         return (1);
4716 }
4717
4718 /* See uma.h */
4719 void
4720 uma_prealloc(uma_zone_t zone, int items)
4721 {
4722         struct vm_domainset_iter di;
4723         uma_domain_t dom;
4724         uma_slab_t slab;
4725         uma_keg_t keg;
4726         int aflags, domain, slabs;
4727
4728         KEG_GET(zone, keg);
4729         slabs = howmany(items, keg->uk_ipers);
4730         while (slabs-- > 0) {
4731                 aflags = M_NOWAIT;
4732                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
4733                     &aflags);
4734                 for (;;) {
4735                         slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
4736                             aflags);
4737                         if (slab != NULL) {
4738                                 dom = &keg->uk_domain[slab->us_domain];
4739                                 /*
4740                                  * keg_alloc_slab() always returns a slab on the
4741                                  * partial list.
4742                                  */
4743                                 LIST_REMOVE(slab, us_link);
4744                                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
4745                                     us_link);
4746                                 dom->ud_free_slabs++;
4747                                 KEG_UNLOCK(keg, slab->us_domain);
4748                                 break;
4749                         }
4750                         if (vm_domainset_iter_policy(&di, &domain) != 0)
4751                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
4752                 }
4753         }
4754 }
4755
4756 /*
4757  * Returns a snapshot of memory consumption in bytes.
4758  */
4759 size_t
4760 uma_zone_memory(uma_zone_t zone)
4761 {
4762         size_t sz;
4763         int i;
4764
4765         sz = 0;
4766         if (zone->uz_flags & UMA_ZFLAG_CACHE) {
4767                 for (i = 0; i < vm_ndomains; i++)
4768                         sz += ZDOM_GET(zone, i)->uzd_nitems;
4769                 return (sz * zone->uz_size);
4770         }
4771         for (i = 0; i < vm_ndomains; i++)
4772                 sz += zone->uz_keg->uk_domain[i].ud_pages;
4773
4774         return (sz * PAGE_SIZE);
4775 }
4776
4777 /* See uma.h */
4778 void
4779 uma_reclaim(int req)
4780 {
4781
4782         CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
4783         sx_xlock(&uma_reclaim_lock);
4784         bucket_enable();
4785
4786         switch (req) {
4787         case UMA_RECLAIM_TRIM:
4788                 zone_foreach(zone_trim, NULL);
4789                 break;
4790         case UMA_RECLAIM_DRAIN:
4791         case UMA_RECLAIM_DRAIN_CPU:
4792                 zone_foreach(zone_drain, NULL);
4793                 if (req == UMA_RECLAIM_DRAIN_CPU) {
4794                         pcpu_cache_drain_safe(NULL);
4795                         zone_foreach(zone_drain, NULL);
4796                 }
4797                 break;
4798         default:
4799                 panic("unhandled reclamation request %d", req);
4800         }
4801
4802         /*
4803          * Some slabs may have been freed but this zone will be visited early
4804          * we visit again so that we can free pages that are empty once other
4805          * zones are drained.  We have to do the same for buckets.
4806          */
4807         zone_drain(slabzones[0], NULL);
4808         zone_drain(slabzones[1], NULL);
4809         bucket_zone_drain();
4810         sx_xunlock(&uma_reclaim_lock);
4811 }
4812
4813 static volatile int uma_reclaim_needed;
4814
4815 void
4816 uma_reclaim_wakeup(void)
4817 {
4818
4819         if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
4820                 wakeup(uma_reclaim);
4821 }
4822
4823 void
4824 uma_reclaim_worker(void *arg __unused)
4825 {
4826
4827         for (;;) {
4828                 sx_xlock(&uma_reclaim_lock);
4829                 while (atomic_load_int(&uma_reclaim_needed) == 0)
4830                         sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
4831                             hz);
4832                 sx_xunlock(&uma_reclaim_lock);
4833                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
4834                 uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
4835                 atomic_store_int(&uma_reclaim_needed, 0);
4836                 /* Don't fire more than once per-second. */
4837                 pause("umarclslp", hz);
4838         }
4839 }
4840
4841 /* See uma.h */
4842 void
4843 uma_zone_reclaim(uma_zone_t zone, int req)
4844 {
4845
4846         switch (req) {
4847         case UMA_RECLAIM_TRIM:
4848                 zone_trim(zone, NULL);
4849                 break;
4850         case UMA_RECLAIM_DRAIN:
4851                 zone_drain(zone, NULL);
4852                 break;
4853         case UMA_RECLAIM_DRAIN_CPU:
4854                 pcpu_cache_drain_safe(zone);
4855                 zone_drain(zone, NULL);
4856                 break;
4857         default:
4858                 panic("unhandled reclamation request %d", req);
4859         }
4860 }
4861
4862 /* See uma.h */
4863 int
4864 uma_zone_exhausted(uma_zone_t zone)
4865 {
4866
4867         return (atomic_load_32(&zone->uz_sleepers) > 0);
4868 }
4869
4870 unsigned long
4871 uma_limit(void)
4872 {
4873
4874         return (uma_kmem_limit);
4875 }
4876
4877 void
4878 uma_set_limit(unsigned long limit)
4879 {
4880
4881         uma_kmem_limit = limit;
4882 }
4883
4884 unsigned long
4885 uma_size(void)
4886 {
4887
4888         return (atomic_load_long(&uma_kmem_total));
4889 }
4890
4891 long
4892 uma_avail(void)
4893 {
4894
4895         return (uma_kmem_limit - uma_size());
4896 }
4897
4898 #ifdef DDB
4899 /*
4900  * Generate statistics across both the zone and its per-cpu cache's.  Return
4901  * desired statistics if the pointer is non-NULL for that statistic.
4902  *
4903  * Note: does not update the zone statistics, as it can't safely clear the
4904  * per-CPU cache statistic.
4905  *
4906  */
4907 static void
4908 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
4909     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
4910 {
4911         uma_cache_t cache;
4912         uint64_t allocs, frees, sleeps, xdomain;
4913         int cachefree, cpu;
4914
4915         allocs = frees = sleeps = xdomain = 0;
4916         cachefree = 0;
4917         CPU_FOREACH(cpu) {
4918                 cache = &z->uz_cpu[cpu];
4919                 cachefree += cache->uc_allocbucket.ucb_cnt;
4920                 cachefree += cache->uc_freebucket.ucb_cnt;
4921                 xdomain += cache->uc_crossbucket.ucb_cnt;
4922                 cachefree += cache->uc_crossbucket.ucb_cnt;
4923                 allocs += cache->uc_allocs;
4924                 frees += cache->uc_frees;
4925         }
4926         allocs += counter_u64_fetch(z->uz_allocs);
4927         frees += counter_u64_fetch(z->uz_frees);
4928         xdomain += counter_u64_fetch(z->uz_xdomain);
4929         sleeps += z->uz_sleeps;
4930         if (cachefreep != NULL)
4931                 *cachefreep = cachefree;
4932         if (allocsp != NULL)
4933                 *allocsp = allocs;
4934         if (freesp != NULL)
4935                 *freesp = frees;
4936         if (sleepsp != NULL)
4937                 *sleepsp = sleeps;
4938         if (xdomainp != NULL)
4939                 *xdomainp = xdomain;
4940 }
4941 #endif /* DDB */
4942
4943 static int
4944 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4945 {
4946         uma_keg_t kz;
4947         uma_zone_t z;
4948         int count;
4949
4950         count = 0;
4951         rw_rlock(&uma_rwlock);
4952         LIST_FOREACH(kz, &uma_kegs, uk_link) {
4953                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
4954                         count++;
4955         }
4956         LIST_FOREACH(z, &uma_cachezones, uz_link)
4957                 count++;
4958
4959         rw_runlock(&uma_rwlock);
4960         return (sysctl_handle_int(oidp, &count, 0, req));
4961 }
4962
4963 static void
4964 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4965     struct uma_percpu_stat *ups, bool internal)
4966 {
4967         uma_zone_domain_t zdom;
4968         uma_cache_t cache;
4969         int i;
4970
4971
4972         for (i = 0; i < vm_ndomains; i++) {
4973                 zdom = ZDOM_GET(z, i);
4974                 uth->uth_zone_free += zdom->uzd_nitems;
4975         }
4976         uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4977         uth->uth_frees = counter_u64_fetch(z->uz_frees);
4978         uth->uth_fails = counter_u64_fetch(z->uz_fails);
4979         uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain);
4980         uth->uth_sleeps = z->uz_sleeps;
4981
4982         for (i = 0; i < mp_maxid + 1; i++) {
4983                 bzero(&ups[i], sizeof(*ups));
4984                 if (internal || CPU_ABSENT(i))
4985                         continue;
4986                 cache = &z->uz_cpu[i];
4987                 ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt;
4988                 ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt;
4989                 ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt;
4990                 ups[i].ups_allocs = cache->uc_allocs;
4991                 ups[i].ups_frees = cache->uc_frees;
4992         }
4993 }
4994
4995 static int
4996 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4997 {
4998         struct uma_stream_header ush;
4999         struct uma_type_header uth;
5000         struct uma_percpu_stat *ups;
5001         struct sbuf sbuf;
5002         uma_keg_t kz;
5003         uma_zone_t z;
5004         uint64_t items;
5005         uint32_t kfree, pages;
5006         int count, error, i;
5007
5008         error = sysctl_wire_old_buffer(req, 0);
5009         if (error != 0)
5010                 return (error);
5011         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
5012         sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
5013         ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
5014
5015         count = 0;
5016         rw_rlock(&uma_rwlock);
5017         LIST_FOREACH(kz, &uma_kegs, uk_link) {
5018                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
5019                         count++;
5020         }
5021
5022         LIST_FOREACH(z, &uma_cachezones, uz_link)
5023                 count++;
5024
5025         /*
5026          * Insert stream header.
5027          */
5028         bzero(&ush, sizeof(ush));
5029         ush.ush_version = UMA_STREAM_VERSION;
5030         ush.ush_maxcpus = (mp_maxid + 1);
5031         ush.ush_count = count;
5032         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
5033
5034         LIST_FOREACH(kz, &uma_kegs, uk_link) {
5035                 kfree = pages = 0;
5036                 for (i = 0; i < vm_ndomains; i++) {
5037                         kfree += kz->uk_domain[i].ud_free_items;
5038                         pages += kz->uk_domain[i].ud_pages;
5039                 }
5040                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
5041                         bzero(&uth, sizeof(uth));
5042                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
5043                         uth.uth_align = kz->uk_align;
5044                         uth.uth_size = kz->uk_size;
5045                         uth.uth_rsize = kz->uk_rsize;
5046                         if (z->uz_max_items > 0) {
5047                                 items = UZ_ITEMS_COUNT(z->uz_items);
5048                                 uth.uth_pages = (items / kz->uk_ipers) *
5049                                         kz->uk_ppera;
5050                         } else
5051                                 uth.uth_pages = pages;
5052                         uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
5053                             kz->uk_ppera;
5054                         uth.uth_limit = z->uz_max_items;
5055                         uth.uth_keg_free = kfree;
5056
5057                         /*
5058                          * A zone is secondary is it is not the first entry
5059                          * on the keg's zone list.
5060                          */
5061                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
5062                             (LIST_FIRST(&kz->uk_zones) != z))
5063                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
5064                         uma_vm_zone_stats(&uth, z, &sbuf, ups,
5065                             kz->uk_flags & UMA_ZFLAG_INTERNAL);
5066                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
5067                         for (i = 0; i < mp_maxid + 1; i++)
5068                                 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
5069                 }
5070         }
5071         LIST_FOREACH(z, &uma_cachezones, uz_link) {
5072                 bzero(&uth, sizeof(uth));
5073                 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
5074                 uth.uth_size = z->uz_size;
5075                 uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
5076                 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
5077                 for (i = 0; i < mp_maxid + 1; i++)
5078                         (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
5079         }
5080
5081         rw_runlock(&uma_rwlock);
5082         error = sbuf_finish(&sbuf);
5083         sbuf_delete(&sbuf);
5084         free(ups, M_TEMP);
5085         return (error);
5086 }
5087
5088 int
5089 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
5090 {
5091         uma_zone_t zone = *(uma_zone_t *)arg1;
5092         int error, max;
5093
5094         max = uma_zone_get_max(zone);
5095         error = sysctl_handle_int(oidp, &max, 0, req);
5096         if (error || !req->newptr)
5097                 return (error);
5098
5099         uma_zone_set_max(zone, max);
5100
5101         return (0);
5102 }
5103
5104 int
5105 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
5106 {
5107         uma_zone_t zone;
5108         int cur;
5109
5110         /*
5111          * Some callers want to add sysctls for global zones that
5112          * may not yet exist so they pass a pointer to a pointer.
5113          */
5114         if (arg2 == 0)
5115                 zone = *(uma_zone_t *)arg1;
5116         else
5117                 zone = arg1;
5118         cur = uma_zone_get_cur(zone);
5119         return (sysctl_handle_int(oidp, &cur, 0, req));
5120 }
5121
5122 static int
5123 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
5124 {
5125         uma_zone_t zone = arg1;
5126         uint64_t cur;
5127
5128         cur = uma_zone_get_allocs(zone);
5129         return (sysctl_handle_64(oidp, &cur, 0, req));
5130 }
5131
5132 static int
5133 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
5134 {
5135         uma_zone_t zone = arg1;
5136         uint64_t cur;
5137
5138         cur = uma_zone_get_frees(zone);
5139         return (sysctl_handle_64(oidp, &cur, 0, req));
5140 }
5141
5142 static int
5143 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
5144 {
5145         struct sbuf sbuf;
5146         uma_zone_t zone = arg1;
5147         int error;
5148
5149         sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
5150         if (zone->uz_flags != 0)
5151                 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
5152         else
5153                 sbuf_printf(&sbuf, "0");
5154         error = sbuf_finish(&sbuf);
5155         sbuf_delete(&sbuf);
5156
5157         return (error);
5158 }
5159
5160 static int
5161 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
5162 {
5163         uma_keg_t keg = arg1;
5164         int avail, effpct, total;
5165
5166         total = keg->uk_ppera * PAGE_SIZE;
5167         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
5168                 total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize;
5169         /*
5170          * We consider the client's requested size and alignment here, not the
5171          * real size determination uk_rsize, because we also adjust the real
5172          * size for internal implementation reasons (max bitset size).
5173          */
5174         avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
5175         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
5176                 avail *= mp_maxid + 1;
5177         effpct = 100 * avail / total;
5178         return (sysctl_handle_int(oidp, &effpct, 0, req));
5179 }
5180
5181 static int
5182 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
5183 {
5184         uma_zone_t zone = arg1;
5185         uint64_t cur;
5186
5187         cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
5188         return (sysctl_handle_64(oidp, &cur, 0, req));
5189 }
5190
5191 #ifdef INVARIANTS
5192 static uma_slab_t
5193 uma_dbg_getslab(uma_zone_t zone, void *item)
5194 {
5195         uma_slab_t slab;
5196         uma_keg_t keg;
5197         uint8_t *mem;
5198
5199         /*
5200          * It is safe to return the slab here even though the
5201          * zone is unlocked because the item's allocation state
5202          * essentially holds a reference.
5203          */
5204         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
5205         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
5206                 return (NULL);
5207         if (zone->uz_flags & UMA_ZFLAG_VTOSLAB)
5208                 return (vtoslab((vm_offset_t)mem));
5209         keg = zone->uz_keg;
5210         if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0)
5211                 return ((uma_slab_t)(mem + keg->uk_pgoff));
5212         KEG_LOCK(keg, 0);
5213         slab = hash_sfind(&keg->uk_hash, mem);
5214         KEG_UNLOCK(keg, 0);
5215
5216         return (slab);
5217 }
5218
5219 static bool
5220 uma_dbg_zskip(uma_zone_t zone, void *mem)
5221 {
5222
5223         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
5224                 return (true);
5225
5226         return (uma_dbg_kskip(zone->uz_keg, mem));
5227 }
5228
5229 static bool
5230 uma_dbg_kskip(uma_keg_t keg, void *mem)
5231 {
5232         uintptr_t idx;
5233
5234         if (dbg_divisor == 0)
5235                 return (true);
5236
5237         if (dbg_divisor == 1)
5238                 return (false);
5239
5240         idx = (uintptr_t)mem >> PAGE_SHIFT;
5241         if (keg->uk_ipers > 1) {
5242                 idx *= keg->uk_ipers;
5243                 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
5244         }
5245
5246         if ((idx / dbg_divisor) * dbg_divisor != idx) {
5247                 counter_u64_add(uma_skip_cnt, 1);
5248                 return (true);
5249         }
5250         counter_u64_add(uma_dbg_cnt, 1);
5251
5252         return (false);
5253 }
5254
5255 /*
5256  * Set up the slab's freei data such that uma_dbg_free can function.
5257  *
5258  */
5259 static void
5260 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
5261 {
5262         uma_keg_t keg;
5263         int freei;
5264
5265         if (slab == NULL) {
5266                 slab = uma_dbg_getslab(zone, item);
5267                 if (slab == NULL)
5268                         panic("uma: item %p did not belong to zone %s\n",
5269                             item, zone->uz_name);
5270         }
5271         keg = zone->uz_keg;
5272         freei = slab_item_index(slab, keg, item);
5273
5274         if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
5275                 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
5276                     item, zone, zone->uz_name, slab, freei);
5277         BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
5278 }
5279
5280 /*
5281  * Verifies freed addresses.  Checks for alignment, valid slab membership
5282  * and duplicate frees.
5283  *
5284  */
5285 static void
5286 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
5287 {
5288         uma_keg_t keg;
5289         int freei;
5290
5291         if (slab == NULL) {
5292                 slab = uma_dbg_getslab(zone, item);
5293                 if (slab == NULL)
5294                         panic("uma: Freed item %p did not belong to zone %s\n",
5295                             item, zone->uz_name);
5296         }
5297         keg = zone->uz_keg;
5298         freei = slab_item_index(slab, keg, item);
5299
5300         if (freei >= keg->uk_ipers)
5301                 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
5302                     item, zone, zone->uz_name, slab, freei);
5303
5304         if (slab_item(slab, keg, freei) != item)
5305                 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
5306                     item, zone, zone->uz_name, slab, freei);
5307
5308         if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
5309                 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
5310                     item, zone, zone->uz_name, slab, freei);
5311
5312         BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
5313 }
5314 #endif /* INVARIANTS */
5315
5316 #ifdef DDB
5317 static int64_t
5318 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
5319     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
5320 {
5321         uint64_t frees;
5322         int i;
5323
5324         if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
5325                 *allocs = counter_u64_fetch(z->uz_allocs);
5326                 frees = counter_u64_fetch(z->uz_frees);
5327                 *sleeps = z->uz_sleeps;
5328                 *cachefree = 0;
5329                 *xdomain = 0;
5330         } else
5331                 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
5332                     xdomain);
5333         for (i = 0; i < vm_ndomains; i++) {
5334                 *cachefree += ZDOM_GET(z, i)->uzd_nitems;
5335                 if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
5336                     (LIST_FIRST(&kz->uk_zones) != z)))
5337                         *cachefree += kz->uk_domain[i].ud_free_items;
5338         }
5339         *used = *allocs - frees;
5340         return (((int64_t)*used + *cachefree) * kz->uk_size);
5341 }
5342
5343 DB_SHOW_COMMAND(uma, db_show_uma)
5344 {
5345         const char *fmt_hdr, *fmt_entry;
5346         uma_keg_t kz;
5347         uma_zone_t z;
5348         uint64_t allocs, used, sleeps, xdomain;
5349         long cachefree;
5350         /* variables for sorting */
5351         uma_keg_t cur_keg;
5352         uma_zone_t cur_zone, last_zone;
5353         int64_t cur_size, last_size, size;
5354         int ties;
5355
5356         /* /i option produces machine-parseable CSV output */
5357         if (modif[0] == 'i') {
5358                 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
5359                 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
5360         } else {
5361                 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
5362                 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
5363         }
5364
5365         db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
5366             "Sleeps", "Bucket", "Total Mem", "XFree");
5367
5368         /* Sort the zones with largest size first. */
5369         last_zone = NULL;
5370         last_size = INT64_MAX;
5371         for (;;) {
5372                 cur_zone = NULL;
5373                 cur_size = -1;
5374                 ties = 0;
5375                 LIST_FOREACH(kz, &uma_kegs, uk_link) {
5376                         LIST_FOREACH(z, &kz->uk_zones, uz_link) {
5377                                 /*
5378                                  * In the case of size ties, print out zones
5379                                  * in the order they are encountered.  That is,
5380                                  * when we encounter the most recently output
5381                                  * zone, we have already printed all preceding
5382                                  * ties, and we must print all following ties.
5383                                  */
5384                                 if (z == last_zone) {
5385                                         ties = 1;
5386                                         continue;
5387                                 }
5388                                 size = get_uma_stats(kz, z, &allocs, &used,
5389                                     &sleeps, &cachefree, &xdomain);
5390                                 if (size > cur_size && size < last_size + ties)
5391                                 {
5392                                         cur_size = size;
5393                                         cur_zone = z;
5394                                         cur_keg = kz;
5395                                 }
5396                         }
5397                 }
5398                 if (cur_zone == NULL)
5399                         break;
5400
5401                 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
5402                     &sleeps, &cachefree, &xdomain);
5403                 db_printf(fmt_entry, cur_zone->uz_name,
5404                     (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
5405                     (uintmax_t)allocs, (uintmax_t)sleeps,
5406                     (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
5407                     xdomain);
5408
5409                 if (db_pager_quit)
5410                         return;
5411                 last_zone = cur_zone;
5412                 last_size = cur_size;
5413         }
5414 }
5415
5416 DB_SHOW_COMMAND(umacache, db_show_umacache)
5417 {
5418         uma_zone_t z;
5419         uint64_t allocs, frees;
5420         long cachefree;
5421         int i;
5422
5423         db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
5424             "Requests", "Bucket");
5425         LIST_FOREACH(z, &uma_cachezones, uz_link) {
5426                 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
5427                 for (i = 0; i < vm_ndomains; i++)
5428                         cachefree += ZDOM_GET(z, i)->uzd_nitems;
5429                 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
5430                     z->uz_name, (uintmax_t)z->uz_size,
5431                     (intmax_t)(allocs - frees), cachefree,
5432                     (uintmax_t)allocs, z->uz_bucket_size);
5433                 if (db_pager_quit)
5434                         return;
5435         }
5436 }
5437 #endif  /* DDB */