sys/vm/uma_core.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
   5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
   6  * Copyright (c) 2004-2006 Robert N. M. Watson
   7  * All rights reserved.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice unmodified, this list of conditions, and the following
  14  *    disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
  31 /*
  32  * uma_core.c  Implementation of the Universal Memory allocator
  33  *
  34  * This allocator is intended to replace the multitude of similar object caches
  35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  36  * efficient.  A primary design goal is to return unused memory to the rest of
  37  * the system.  This will make the system as a whole more flexible due to the
  38  * ability to move memory to subsystems which most need it instead of leaving
  39  * pools of reserved memory unused.
  40  *
  41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  42  * are well known.
  43  *
  44  */
  45
  46 /*
  47  * TODO:
  48  *      - Improve memory usage for large allocations
  49  *      - Investigate cache size adjustments
  50  */
  51
  52 #include <sys/cdefs.h>
  53 __FBSDID("$FreeBSD$");
  54
  55 #include "opt_ddb.h"
  56 #include "opt_param.h"
  57 #include "opt_vm.h"
  58
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/bitset.h>
  62 #include <sys/domainset.h>
  63 #include <sys/eventhandler.h>
  64 #include <sys/kernel.h>
  65 #include <sys/types.h>
  66 #include <sys/limits.h>
  67 #include <sys/queue.h>
  68 #include <sys/malloc.h>
  69 #include <sys/ktr.h>
  70 #include <sys/lock.h>
  71 #include <sys/sysctl.h>
  72 #include <sys/mutex.h>
  73 #include <sys/proc.h>
  74 #include <sys/random.h>
  75 #include <sys/rwlock.h>
  76 #include <sys/sbuf.h>
  77 #include <sys/sched.h>
  78 #include <sys/sleepqueue.h>
  79 #include <sys/smp.h>
  80 #include <sys/smr.h>
  81 #include <sys/taskqueue.h>
  82 #include <sys/vmmeter.h>
  83
  84 #include <vm/vm.h>
  85 #include <vm/vm_domainset.h>
  86 #include <vm/vm_object.h>
  87 #include <vm/vm_page.h>
  88 #include <vm/vm_pageout.h>
  89 #include <vm/vm_param.h>
  90 #include <vm/vm_phys.h>
  91 #include <vm/vm_pagequeue.h>
  92 #include <vm/vm_map.h>
  93 #include <vm/vm_kern.h>
  94 #include <vm/vm_extern.h>
  95 #include <vm/uma.h>
  96 #include <vm/uma_int.h>
  97 #include <vm/uma_dbg.h>
  98
  99 #include <ddb/ddb.h>
 100
 101 #ifdef DEBUG_MEMGUARD
 102 #include <vm/memguard.h>
 103 #endif
 104
 105 #include <machine/md_var.h>
 106
 107 #ifdef INVARIANTS
 108 #define UMA_ALWAYS_CTORDTOR     1
 109 #else
 110 #define UMA_ALWAYS_CTORDTOR     0
 111 #endif
 112
 113 /*
 114  * This is the zone and keg from which all zones are spawned.
 115  */
 116 static uma_zone_t kegs;
 117 static uma_zone_t zones;
 118
 119 /*
 120  * These are the two zones from which all offpage uma_slab_ts are allocated.
 121  *
 122  * One zone is for slab headers that can represent a larger number of items,
 123  * making the slabs themselves more efficient, and the other zone is for
 124  * headers that are smaller and represent fewer items, making the headers more
 125  * efficient.
 126  */
 127 #define SLABZONE_SIZE(setsize)                                  \
 128     (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
 129 #define SLABZONE0_SETSIZE       (PAGE_SIZE / 16)
 130 #define SLABZONE1_SETSIZE       SLAB_MAX_SETSIZE
 131 #define SLABZONE0_SIZE  SLABZONE_SIZE(SLABZONE0_SETSIZE)
 132 #define SLABZONE1_SIZE  SLABZONE_SIZE(SLABZONE1_SETSIZE)
 133 static uma_zone_t slabzones[2];
 134
 135 /*
 136  * The initial hash tables come out of this zone so they can be allocated
 137  * prior to malloc coming up.
 138  */
 139 static uma_zone_t hashzone;
 140
 141 /* The boot-time adjusted value for cache line alignment. */
 142 int uma_align_cache = 64 - 1;
 143
 144 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
 145 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");
 146
 147 /*
 148  * Are we allowed to allocate buckets?
 149  */
 150 static int bucketdisable = 1;
 151
 152 /* Linked list of all kegs in the system */
 153 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 154
 155 /* Linked list of all cache-only zones in the system */
 156 static LIST_HEAD(,uma_zone) uma_cachezones =
 157     LIST_HEAD_INITIALIZER(uma_cachezones);
 158
 159 /* This RW lock protects the keg list */
 160 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 161
 162 /*
 163  * First available virual address for boot time allocations.
 164  */
 165 static vm_offset_t bootstart;
 166 static vm_offset_t bootmem;
 167
 168 static struct sx uma_reclaim_lock;
 169
 170 /*
 171  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
 172  * allocations don't trigger a wakeup of the reclaim thread.
 173  */
 174 unsigned long uma_kmem_limit = LONG_MAX;
 175 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
 176     "UMA kernel memory soft limit");
 177 unsigned long uma_kmem_total;
 178 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
 179     "UMA kernel memory usage");
 180
 181 /* Is the VM done starting up? */
 182 static enum {
 183         BOOT_COLD,
 184         BOOT_KVA,
 185         BOOT_RUNNING,
 186         BOOT_SHUTDOWN,
 187 } booted = BOOT_COLD;
 188
 189 /*
 190  * This is the handle used to schedule events that need to happen
 191  * outside of the allocation fast path.
 192  */
 193 static struct callout uma_callout;
 194 #define UMA_TIMEOUT     20              /* Seconds for callout interval. */
 195
 196 /*
 197  * This structure is passed as the zone ctor arg so that I don't have to create
 198  * a special allocation function just for zones.
 199  */
 200 struct uma_zctor_args {
 201         const char *name;
 202         size_t size;
 203         uma_ctor ctor;
 204         uma_dtor dtor;
 205         uma_init uminit;
 206         uma_fini fini;
 207         uma_import import;
 208         uma_release release;
 209         void *arg;
 210         uma_keg_t keg;
 211         int align;
 212         uint32_t flags;
 213 };
 214
 215 struct uma_kctor_args {
 216         uma_zone_t zone;
 217         size_t size;
 218         uma_init uminit;
 219         uma_fini fini;
 220         int align;
 221         uint32_t flags;
 222 };
 223
 224 struct uma_bucket_zone {
 225         uma_zone_t      ubz_zone;
 226         const char      *ubz_name;
 227         int             ubz_entries;    /* Number of items it can hold. */
 228         int             ubz_maxsize;    /* Maximum allocation size per-item. */
 229 };
 230
 231 /*
 232  * Compute the actual number of bucket entries to pack them in power
 233  * of two sizes for more efficient space utilization.
 234  */
 235 #define BUCKET_SIZE(n)                                          \
 236     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 237
 238 #define BUCKET_MAX      BUCKET_SIZE(256)
 239 #define BUCKET_MIN      2
 240
 241 struct uma_bucket_zone bucket_zones[] = {
 242         /* Literal bucket sizes. */
 243         { NULL, "2 Bucket", 2, 4096 },
 244         { NULL, "4 Bucket", 4, 3072 },
 245         { NULL, "8 Bucket", 8, 2048 },
 246         { NULL, "16 Bucket", 16, 1024 },
 247         /* Rounded down power of 2 sizes for efficiency. */
 248         { NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
 249         { NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
 250         { NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
 251         { NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 252         { NULL, NULL, 0}
 253 };
 254
 255 /*
 256  * Flags and enumerations to be passed to internal functions.
 257  */
 258 enum zfreeskip {
 259         SKIP_NONE =     0,
 260         SKIP_CNT =      0x00000001,
 261         SKIP_DTOR =     0x00010000,
 262         SKIP_FINI =     0x00020000,
 263 };
 264
 265 /* Prototypes.. */
 266
 267 void    uma_startup1(vm_offset_t);
 268 void    uma_startup2(void);
 269
 270 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 271 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 272 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 273 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 274 static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
 275 static void page_free(void *, vm_size_t, uint8_t);
 276 static void pcpu_page_free(void *, vm_size_t, uint8_t);
 277 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 278 static void cache_drain(uma_zone_t);
 279 static void bucket_drain(uma_zone_t, uma_bucket_t);
 280 static void bucket_cache_reclaim(uma_zone_t zone, bool);
 281 static int keg_ctor(void *, int, void *, int);
 282 static void keg_dtor(void *, int, void *);
 283 static int zone_ctor(void *, int, void *, int);
 284 static void zone_dtor(void *, int, void *);
 285 static inline void item_dtor(uma_zone_t zone, void *item, int size,
 286     void *udata, enum zfreeskip skip);
 287 static int zero_init(void *, int, int);
 288 static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
 289     int itemdomain, bool ws);
 290 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
 291 static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
 292 static void zone_timeout(uma_zone_t zone, void *);
 293 static int hash_alloc(struct uma_hash *, u_int);
 294 static int hash_expand(struct uma_hash *, struct uma_hash *);
 295 static void hash_free(struct uma_hash *hash);
 296 static void uma_timeout(void *);
 297 static void uma_startup3(void);
 298 static void uma_shutdown(void);
 299 static void *zone_alloc_item(uma_zone_t, void *, int, int);
 300 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 301 static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
 302 static void zone_free_limit(uma_zone_t zone, int count);
 303 static void bucket_enable(void);
 304 static void bucket_init(void);
 305 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
 306 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 307 static void bucket_zone_drain(void);
 308 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
 309 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
 310 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
 311 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
 312     uma_fini fini, int align, uint32_t flags);
 313 static int zone_import(void *, void **, int, int, int);
 314 static void zone_release(void *, void **, int);
 315 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
 316 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
 317
 318 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 319 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 320 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
 321 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
 322 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
 323 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
 324 static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
 325
 326 static uint64_t uma_zone_get_allocs(uma_zone_t zone);
 327
 328 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
 329     "Memory allocation debugging");
 330
 331 #ifdef INVARIANTS
 332 static uint64_t uma_keg_get_allocs(uma_keg_t zone);
 333 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
 334
 335 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
 336 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
 337 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
 338 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
 339
 340 static u_int dbg_divisor = 1;
 341 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
 342     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
 343     "Debug & thrash every this item in memory allocator");
 344
 345 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
 346 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
 347 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
 348     &uma_dbg_cnt, "memory items debugged");
 349 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
 350     &uma_skip_cnt, "memory items skipped, not debugged");
 351 #endif
 352
 353 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 354
 355 SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 356     "Universal Memory Allocator");
 357
 358 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
 359     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
 360
 361 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
 362     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 363
 364 static int zone_warnings = 1;
 365 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
 366     "Warn when UMA zones becomes full");
 367
 368 static int multipage_slabs = 1;
 369 TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs);
 370 SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs,
 371     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0,
 372     "UMA may choose larger slab sizes for better efficiency");
 373
 374 /*
 375  * Select the slab zone for an offpage slab with the given maximum item count.
 376  */
 377 static inline uma_zone_t
 378 slabzone(int ipers)
 379 {
 380
 381         return (slabzones[ipers > SLABZONE0_SETSIZE]);
 382 }
 383
 384 /*
 385  * This routine checks to see whether or not it's safe to enable buckets.
 386  */
 387 static void
 388 bucket_enable(void)
 389 {
 390
 391         KASSERT(booted >= BOOT_KVA, ("Bucket enable before init"));
 392         bucketdisable = vm_page_count_min();
 393 }
 394
 395 /*
 396  * Initialize bucket_zones, the array of zones of buckets of various sizes.
 397  *
 398  * For each zone, calculate the memory required for each bucket, consisting
 399  * of the header and an array of pointers.
 400  */
 401 static void
 402 bucket_init(void)
 403 {
 404         struct uma_bucket_zone *ubz;
 405         int size;
 406
 407         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 408                 size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 409                 size += sizeof(void *) * ubz->ubz_entries;
 410                 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 411                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 412                     UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
 413                     UMA_ZONE_FIRSTTOUCH);
 414         }
 415 }
 416
 417 /*
 418  * Given a desired number of entries for a bucket, return the zone from which
 419  * to allocate the bucket.
 420  */
 421 static struct uma_bucket_zone *
 422 bucket_zone_lookup(int entries)
 423 {
 424         struct uma_bucket_zone *ubz;
 425
 426         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 427                 if (ubz->ubz_entries >= entries)
 428                         return (ubz);
 429         ubz--;
 430         return (ubz);
 431 }
 432
 433 static struct uma_bucket_zone *
 434 bucket_zone_max(uma_zone_t zone, int nitems)
 435 {
 436         struct uma_bucket_zone *ubz;
 437         int bpcpu;
 438
 439         bpcpu = 2;
 440         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
 441                 /* Count the cross-domain bucket. */
 442                 bpcpu++;
 443
 444         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 445                 if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
 446                         break;
 447         if (ubz == &bucket_zones[0])
 448                 ubz = NULL;
 449         else
 450                 ubz--;
 451         return (ubz);
 452 }
 453
 454 static int
 455 bucket_select(int size)
 456 {
 457         struct uma_bucket_zone *ubz;
 458
 459         ubz = &bucket_zones[0];
 460         if (size > ubz->ubz_maxsize)
 461                 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
 462
 463         for (; ubz->ubz_entries != 0; ubz++)
 464                 if (ubz->ubz_maxsize < size)
 465                         break;
 466         ubz--;
 467         return (ubz->ubz_entries);
 468 }
 469
 470 static uma_bucket_t
 471 bucket_alloc(uma_zone_t zone, void *udata, int flags)
 472 {
 473         struct uma_bucket_zone *ubz;
 474         uma_bucket_t bucket;
 475
 476         /*
 477          * Don't allocate buckets early in boot.
 478          */
 479         if (__predict_false(booted < BOOT_KVA))
 480                 return (NULL);
 481
 482         /*
 483          * To limit bucket recursion we store the original zone flags
 484          * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
 485          * NOVM flag to persist even through deep recursions.  We also
 486          * store ZFLAG_BUCKET once we have recursed attempting to allocate
 487          * a bucket for a bucket zone so we do not allow infinite bucket
 488          * recursion.  This cookie will even persist to frees of unused
 489          * buckets via the allocation path or bucket allocations in the
 490          * free path.
 491          */
 492         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 493                 udata = (void *)(uintptr_t)zone->uz_flags;
 494         else {
 495                 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
 496                         return (NULL);
 497                 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
 498         }
 499         if (((uintptr_t)udata & UMA_ZONE_VM) != 0)
 500                 flags |= M_NOVM;
 501         ubz = bucket_zone_lookup(zone->uz_bucket_size);
 502         if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
 503                 ubz++;
 504         bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 505         if (bucket) {
 506 #ifdef INVARIANTS
 507                 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
 508 #endif
 509                 bucket->ub_cnt = 0;
 510                 bucket->ub_entries = ubz->ubz_entries;
 511                 bucket->ub_seq = SMR_SEQ_INVALID;
 512                 CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
 513                     zone->uz_name, zone, bucket);
 514         }
 515
 516         return (bucket);
 517 }
 518
 519 static void
 520 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 521 {
 522         struct uma_bucket_zone *ubz;
 523
 524         if (bucket->ub_cnt != 0)
 525                 bucket_drain(zone, bucket);
 526
 527         KASSERT(bucket->ub_cnt == 0,
 528             ("bucket_free: Freeing a non free bucket."));
 529         KASSERT(bucket->ub_seq == SMR_SEQ_INVALID,
 530             ("bucket_free: Freeing an SMR bucket."));
 531         if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
 532                 udata = (void *)(uintptr_t)zone->uz_flags;
 533         ubz = bucket_zone_lookup(bucket->ub_entries);
 534         uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 535 }
 536
 537 static void
 538 bucket_zone_drain(void)
 539 {
 540         struct uma_bucket_zone *ubz;
 541
 542         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
 543                 uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
 544 }
 545
 546 /*
 547  * Acquire the domain lock and record contention.
 548  */
 549 static uma_zone_domain_t
 550 zone_domain_lock(uma_zone_t zone, int domain)
 551 {
 552         uma_zone_domain_t zdom;
 553         bool lockfail;
 554
 555         zdom = ZDOM_GET(zone, domain);
 556         lockfail = false;
 557         if (ZDOM_OWNED(zdom))
 558                 lockfail = true;
 559         ZDOM_LOCK(zdom);
 560         /* This is unsynchronized.  The counter does not need to be precise. */
 561         if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
 562                 zone->uz_bucket_size++;
 563         return (zdom);
 564 }
 565
 566 /*
 567  * Search for the domain with the least cached items and return it, breaking
 568  * ties with a preferred domain by returning it.
 569  */
 570 static __noinline int
 571 zone_domain_lowest(uma_zone_t zone, int pref)
 572 {
 573         long least, nitems;
 574         int domain;
 575         int i;
 576
 577         least = LONG_MAX;
 578         domain = 0;
 579         for (i = 0; i < vm_ndomains; i++) {
 580                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
 581                 if (nitems < least) {
 582                         domain = i;
 583                         least = nitems;
 584                 } else if (nitems == least && (i == pref || domain == pref))
 585                         domain = pref;
 586         }
 587
 588         return (domain);
 589 }
 590
 591 /*
 592  * Search for the domain with the most cached items and return it or the
 593  * preferred domain if it has enough to proceed.
 594  */
 595 static __noinline int
 596 zone_domain_highest(uma_zone_t zone, int pref)
 597 {
 598         long most, nitems;
 599         int domain;
 600         int i;
 601
 602         if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX)
 603                 return (pref);
 604
 605         most = 0;
 606         domain = 0;
 607         for (i = 0; i < vm_ndomains; i++) {
 608                 nitems = ZDOM_GET(zone, i)->uzd_nitems;
 609                 if (nitems > most) {
 610                         domain = i;
 611                         most = nitems;
 612                 }
 613         }
 614
 615         return (domain);
 616 }
 617
 618 /*
 619  * Safely subtract cnt from imax.
 620  */
 621 static void
 622 zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt)
 623 {
 624         long new;
 625         long old;
 626
 627         old = zdom->uzd_imax;
 628         do {
 629                 if (old <= cnt)
 630                         new = 0;
 631                 else
 632                         new = old - cnt;
 633         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0);
 634 }
 635
 636 /*
 637  * Set the maximum imax value.
 638  */
 639 static void
 640 zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
 641 {
 642         long old;
 643
 644         old = zdom->uzd_imax;
 645         do {
 646                 if (old >= nitems)
 647                         break;
 648         } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
 649 }
 650
 651 /*
 652  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
 653  * zone's caches.  If a bucket is found the zone is not locked on return.
 654  */
 655 static uma_bucket_t
 656 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 657 {
 658         uma_bucket_t bucket;
 659         int i;
 660         bool dtor = false;
 661
 662         ZDOM_LOCK_ASSERT(zdom);
 663
 664         if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
 665                 return (NULL);
 666
 667         /* SMR Buckets can not be re-used until readers expire. */
 668         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
 669             bucket->ub_seq != SMR_SEQ_INVALID) {
 670                 if (!smr_poll(zone->uz_smr, bucket->ub_seq, false))
 671                         return (NULL);
 672                 bucket->ub_seq = SMR_SEQ_INVALID;
 673                 dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR;
 674                 if (STAILQ_NEXT(bucket, ub_link) != NULL)
 675                         zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq;
 676         }
 677         MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
 678         STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link);
 679         zdom->uzd_nitems -= bucket->ub_cnt;
 680
 681         /*
 682          * Shift the bounds of the current WSS interval to avoid
 683          * perturbing the estimate.
 684          */
 685         if (reclaim) {
 686                 zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
 687                 zone_domain_imax_sub(zdom, bucket->ub_cnt);
 688         } else if (zdom->uzd_imin > zdom->uzd_nitems)
 689                 zdom->uzd_imin = zdom->uzd_nitems;
 690
 691         ZDOM_UNLOCK(zdom);
 692         if (dtor)
 693                 for (i = 0; i < bucket->ub_cnt; i++)
 694                         item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
 695                             NULL, SKIP_NONE);
 696
 697         return (bucket);
 698 }
 699
 700 /*
 701  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
 702  * whether the bucket's contents should be counted as part of the zone's working
 703  * set.  The bucket may be freed if it exceeds the bucket limit.
 704  */
 705 static void
 706 zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
 707     const bool ws)
 708 {
 709         uma_zone_domain_t zdom;
 710
 711         /* We don't cache empty buckets.  This can happen after a reclaim. */
 712         if (bucket->ub_cnt == 0)
 713                 goto out;
 714         zdom = zone_domain_lock(zone, domain);
 715
 716         KASSERT(!ws || zdom->uzd_nitems < zone->uz_bucket_max,
 717             ("%s: zone %p overflow", __func__, zone));
 718
 719         /*
 720          * Conditionally set the maximum number of items.
 721          */
 722         zdom->uzd_nitems += bucket->ub_cnt;
 723         if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
 724                 if (ws)
 725                         zone_domain_imax_set(zdom, zdom->uzd_nitems);
 726                 if (STAILQ_EMPTY(&zdom->uzd_buckets))
 727                         zdom->uzd_seq = bucket->ub_seq;
 728                 STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
 729                 ZDOM_UNLOCK(zdom);
 730                 return;
 731         }
 732         zdom->uzd_nitems -= bucket->ub_cnt;
 733         ZDOM_UNLOCK(zdom);
 734 out:
 735         bucket_free(zone, bucket, udata);
 736 }
 737
 738 /* Pops an item out of a per-cpu cache bucket. */
 739 static inline void *
 740 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket)
 741 {
 742         void *item;
 743
 744         CRITICAL_ASSERT(curthread);
 745
 746         bucket->ucb_cnt--;
 747         item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt];
 748 #ifdef INVARIANTS
 749         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL;
 750         KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
 751 #endif
 752         cache->uc_allocs++;
 753
 754         return (item);
 755 }
 756
 757 /* Pushes an item into a per-cpu cache bucket. */
 758 static inline void
 759 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item)
 760 {
 761
 762         CRITICAL_ASSERT(curthread);
 763         KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL,
 764             ("uma_zfree: Freeing to non free bucket index."));
 765
 766         bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item;
 767         bucket->ucb_cnt++;
 768         cache->uc_frees++;
 769 }
 770
 771 /*
 772  * Unload a UMA bucket from a per-cpu cache.
 773  */
 774 static inline uma_bucket_t
 775 cache_bucket_unload(uma_cache_bucket_t bucket)
 776 {
 777         uma_bucket_t b;
 778
 779         b = bucket->ucb_bucket;
 780         if (b != NULL) {
 781                 MPASS(b->ub_entries == bucket->ucb_entries);
 782                 b->ub_cnt = bucket->ucb_cnt;
 783                 bucket->ucb_bucket = NULL;
 784                 bucket->ucb_entries = bucket->ucb_cnt = 0;
 785         }
 786
 787         return (b);
 788 }
 789
 790 static inline uma_bucket_t
 791 cache_bucket_unload_alloc(uma_cache_t cache)
 792 {
 793
 794         return (cache_bucket_unload(&cache->uc_allocbucket));
 795 }
 796
 797 static inline uma_bucket_t
 798 cache_bucket_unload_free(uma_cache_t cache)
 799 {
 800
 801         return (cache_bucket_unload(&cache->uc_freebucket));
 802 }
 803
 804 static inline uma_bucket_t
 805 cache_bucket_unload_cross(uma_cache_t cache)
 806 {
 807
 808         return (cache_bucket_unload(&cache->uc_crossbucket));
 809 }
 810
 811 /*
 812  * Load a bucket into a per-cpu cache bucket.
 813  */
 814 static inline void
 815 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b)
 816 {
 817
 818         CRITICAL_ASSERT(curthread);
 819         MPASS(bucket->ucb_bucket == NULL);
 820         MPASS(b->ub_seq == SMR_SEQ_INVALID);
 821
 822         bucket->ucb_bucket = b;
 823         bucket->ucb_cnt = b->ub_cnt;
 824         bucket->ucb_entries = b->ub_entries;
 825 }
 826
 827 static inline void
 828 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b)
 829 {
 830
 831         cache_bucket_load(&cache->uc_allocbucket, b);
 832 }
 833
 834 static inline void
 835 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b)
 836 {
 837
 838         cache_bucket_load(&cache->uc_freebucket, b);
 839 }
 840
 841 #ifdef NUMA
 842 static inline void
 843 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b)
 844 {
 845
 846         cache_bucket_load(&cache->uc_crossbucket, b);
 847 }
 848 #endif
 849
 850 /*
 851  * Copy and preserve ucb_spare.
 852  */
 853 static inline void
 854 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 855 {
 856
 857         b1->ucb_bucket = b2->ucb_bucket;
 858         b1->ucb_entries = b2->ucb_entries;
 859         b1->ucb_cnt = b2->ucb_cnt;
 860 }
 861
 862 /*
 863  * Swap two cache buckets.
 864  */
 865 static inline void
 866 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2)
 867 {
 868         struct uma_cache_bucket b3;
 869
 870         CRITICAL_ASSERT(curthread);
 871
 872         cache_bucket_copy(&b3, b1);
 873         cache_bucket_copy(b1, b2);
 874         cache_bucket_copy(b2, &b3);
 875 }
 876
 877 /*
 878  * Attempt to fetch a bucket from a zone on behalf of the current cpu cache.
 879  */
 880 static uma_bucket_t
 881 cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain)
 882 {
 883         uma_zone_domain_t zdom;
 884         uma_bucket_t bucket;
 885
 886         /*
 887          * Avoid the lock if possible.
 888          */
 889         zdom = ZDOM_GET(zone, domain);
 890         if (zdom->uzd_nitems == 0)
 891                 return (NULL);
 892
 893         if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 &&
 894             !smr_poll(zone->uz_smr, zdom->uzd_seq, false))
 895                 return (NULL);
 896
 897         /*
 898          * Check the zone's cache of buckets.
 899          */
 900         zdom = zone_domain_lock(zone, domain);
 901         if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) {
 902                 KASSERT(bucket->ub_cnt != 0,
 903                     ("cache_fetch_bucket: Returning an empty bucket."));
 904                 return (bucket);
 905         }
 906         ZDOM_UNLOCK(zdom);
 907
 908         return (NULL);
 909 }
 910
 911 static void
 912 zone_log_warning(uma_zone_t zone)
 913 {
 914         static const struct timeval warninterval = { 300, 0 };
 915
 916         if (!zone_warnings || zone->uz_warning == NULL)
 917                 return;
 918
 919         if (ratecheck(&zone->uz_ratecheck, &warninterval))
 920                 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 921 }
 922
 923 static inline void
 924 zone_maxaction(uma_zone_t zone)
 925 {
 926
 927         if (zone->uz_maxaction.ta_func != NULL)
 928                 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
 929 }
 930
 931 /*
 932  * Routine called by timeout which is used to fire off some time interval
 933  * based calculations.  (stats, hash size, etc.)
 934  *
 935  * Arguments:
 936  *      arg   Unused
 937  *
 938  * Returns:
 939  *      Nothing
 940  */
 941 static void
 942 uma_timeout(void *unused)
 943 {
 944         bucket_enable();
 945         zone_foreach(zone_timeout, NULL);
 946
 947         /* Reschedule this event */
 948         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 949 }
 950
 951 /*
 952  * Update the working set size estimate for the zone's bucket cache.
 953  * The constants chosen here are somewhat arbitrary.  With an update period of
 954  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
 955  * last 100s.
 956  */
 957 static void
 958 zone_domain_update_wss(uma_zone_domain_t zdom)
 959 {
 960         long wss;
 961
 962         ZDOM_LOCK(zdom);
 963         MPASS(zdom->uzd_imax >= zdom->uzd_imin);
 964         wss = zdom->uzd_imax - zdom->uzd_imin;
 965         zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
 966         zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
 967         ZDOM_UNLOCK(zdom);
 968 }
 969
 970 /*
 971  * Routine to perform timeout driven calculations.  This expands the
 972  * hashes and does per cpu statistics aggregation.
 973  *
 974  *  Returns nothing.
 975  */
 976 static void
 977 zone_timeout(uma_zone_t zone, void *unused)
 978 {
 979         uma_keg_t keg;
 980         u_int slabs, pages;
 981
 982         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
 983                 goto update_wss;
 984
 985         keg = zone->uz_keg;
 986
 987         /*
 988          * Hash zones are non-numa by definition so the first domain
 989          * is the only one present.
 990          */
 991         KEG_LOCK(keg, 0);
 992         pages = keg->uk_domain[0].ud_pages;
 993
 994         /*
 995          * Expand the keg hash table.
 996          *
 997          * This is done if the number of slabs is larger than the hash size.
 998          * What I'm trying to do here is completely reduce collisions.  This
 999          * may be a little aggressive.  Should I allow for two collisions max?
1000          */
1001         if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) {
1002                 struct uma_hash newhash;
1003                 struct uma_hash oldhash;
1004                 int ret;
1005
1006                 /*
1007                  * This is so involved because allocating and freeing
1008                  * while the keg lock is held will lead to deadlock.
1009                  * I have to do everything in stages and check for
1010                  * races.
1011                  */
1012                 KEG_UNLOCK(keg, 0);
1013                 ret = hash_alloc(&newhash, 1 << fls(slabs));
1014                 KEG_LOCK(keg, 0);
1015                 if (ret) {
1016                         if (hash_expand(&keg->uk_hash, &newhash)) {
1017                                 oldhash = keg->uk_hash;
1018                                 keg->uk_hash = newhash;
1019                         } else
1020                                 oldhash = newhash;
1021
1022                         KEG_UNLOCK(keg, 0);
1023                         hash_free(&oldhash);
1024                         goto update_wss;
1025                 }
1026         }
1027         KEG_UNLOCK(keg, 0);
1028
1029 update_wss:
1030         for (int i = 0; i < vm_ndomains; i++)
1031                 zone_domain_update_wss(ZDOM_GET(zone, i));
1032 }
1033
1034 /*
1035  * Allocate and zero fill the next sized hash table from the appropriate
1036  * backing store.
1037  *
1038  * Arguments:
1039  *      hash  A new hash structure with the old hash size in uh_hashsize
1040  *
1041  * Returns:
1042  *      1 on success and 0 on failure.
1043  */
1044 static int
1045 hash_alloc(struct uma_hash *hash, u_int size)
1046 {
1047         size_t alloc;
1048
1049         KASSERT(powerof2(size), ("hash size must be power of 2"));
1050         if (size > UMA_HASH_SIZE_INIT)  {
1051                 hash->uh_hashsize = size;
1052                 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
1053                 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
1054         } else {
1055                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
1056                 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
1057                     UMA_ANYDOMAIN, M_WAITOK);
1058                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
1059         }
1060         if (hash->uh_slab_hash) {
1061                 bzero(hash->uh_slab_hash, alloc);
1062                 hash->uh_hashmask = hash->uh_hashsize - 1;
1063                 return (1);
1064         }
1065
1066         return (0);
1067 }
1068
1069 /*
1070  * Expands the hash table for HASH zones.  This is done from zone_timeout
1071  * to reduce collisions.  This must not be done in the regular allocation
1072  * path, otherwise, we can recurse on the vm while allocating pages.
1073  *
1074  * Arguments:
1075  *      oldhash  The hash you want to expand
1076  *      newhash  The hash structure for the new table
1077  *
1078  * Returns:
1079  *      Nothing
1080  *
1081  * Discussion:
1082  */
1083 static int
1084 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
1085 {
1086         uma_hash_slab_t slab;
1087         u_int hval;
1088         u_int idx;
1089
1090         if (!newhash->uh_slab_hash)
1091                 return (0);
1092
1093         if (oldhash->uh_hashsize >= newhash->uh_hashsize)
1094                 return (0);
1095
1096         /*
1097          * I need to investigate hash algorithms for resizing without a
1098          * full rehash.
1099          */
1100
1101         for (idx = 0; idx < oldhash->uh_hashsize; idx++)
1102                 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
1103                         slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
1104                         LIST_REMOVE(slab, uhs_hlink);
1105                         hval = UMA_HASH(newhash, slab->uhs_data);
1106                         LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
1107                             slab, uhs_hlink);
1108                 }
1109
1110         return (1);
1111 }
1112
1113 /*
1114  * Free the hash bucket to the appropriate backing store.
1115  *
1116  * Arguments:
1117  *      slab_hash  The hash bucket we're freeing
1118  *      hashsize   The number of entries in that hash bucket
1119  *
1120  * Returns:
1121  *      Nothing
1122  */
1123 static void
1124 hash_free(struct uma_hash *hash)
1125 {
1126         if (hash->uh_slab_hash == NULL)
1127                 return;
1128         if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
1129                 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
1130         else
1131                 free(hash->uh_slab_hash, M_UMAHASH);
1132 }
1133
1134 /*
1135  * Frees all outstanding items in a bucket
1136  *
1137  * Arguments:
1138  *      zone   The zone to free to, must be unlocked.
1139  *      bucket The free/alloc bucket with items.
1140  *
1141  * Returns:
1142  *      Nothing
1143  */
1144 static void
1145 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
1146 {
1147         int i;
1148
1149         if (bucket->ub_cnt == 0)
1150                 return;
1151
1152         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 &&
1153             bucket->ub_seq != SMR_SEQ_INVALID) {
1154                 smr_wait(zone->uz_smr, bucket->ub_seq);
1155                 bucket->ub_seq = SMR_SEQ_INVALID;
1156                 for (i = 0; i < bucket->ub_cnt; i++)
1157                         item_dtor(zone, bucket->ub_bucket[i],
1158                             zone->uz_size, NULL, SKIP_NONE);
1159         }
1160         if (zone->uz_fini)
1161                 for (i = 0; i < bucket->ub_cnt; i++)
1162                         zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
1163         zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
1164         if (zone->uz_max_items > 0)
1165                 zone_free_limit(zone, bucket->ub_cnt);
1166 #ifdef INVARIANTS
1167         bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
1168 #endif
1169         bucket->ub_cnt = 0;
1170 }
1171
1172 /*
1173  * Drains the per cpu caches for a zone.
1174  *
1175  * NOTE: This may only be called while the zone is being torn down, and not
1176  * during normal operation.  This is necessary in order that we do not have
1177  * to migrate CPUs to drain the per-CPU caches.
1178  *
1179  * Arguments:
1180  *      zone     The zone to drain, must be unlocked.
1181  *
1182  * Returns:
1183  *      Nothing
1184  */
1185 static void
1186 cache_drain(uma_zone_t zone)
1187 {
1188         uma_cache_t cache;
1189         uma_bucket_t bucket;
1190         smr_seq_t seq;
1191         int cpu;
1192
1193         /*
1194          * XXX: It is safe to not lock the per-CPU caches, because we're
1195          * tearing down the zone anyway.  I.e., there will be no further use
1196          * of the caches at this point.
1197          *
1198          * XXX: It would good to be able to assert that the zone is being
1199          * torn down to prevent improper use of cache_drain().
1200          */
1201         seq = SMR_SEQ_INVALID;
1202         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
1203                 seq = smr_advance(zone->uz_smr);
1204         CPU_FOREACH(cpu) {
1205                 cache = &zone->uz_cpu[cpu];
1206                 bucket = cache_bucket_unload_alloc(cache);
1207                 if (bucket != NULL)
1208                         bucket_free(zone, bucket, NULL);
1209                 bucket = cache_bucket_unload_free(cache);
1210                 if (bucket != NULL) {
1211                         bucket->ub_seq = seq;
1212                         bucket_free(zone, bucket, NULL);
1213                 }
1214                 bucket = cache_bucket_unload_cross(cache);
1215                 if (bucket != NULL) {
1216                         bucket->ub_seq = seq;
1217                         bucket_free(zone, bucket, NULL);
1218                 }
1219         }
1220         bucket_cache_reclaim(zone, true);
1221 }
1222
1223 static void
1224 cache_shrink(uma_zone_t zone, void *unused)
1225 {
1226
1227         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
1228                 return;
1229
1230         zone->uz_bucket_size =
1231             (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
1232 }
1233
1234 static void
1235 cache_drain_safe_cpu(uma_zone_t zone, void *unused)
1236 {
1237         uma_cache_t cache;
1238         uma_bucket_t b1, b2, b3;
1239         int domain;
1240
1241         if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
1242                 return;
1243
1244         b1 = b2 = b3 = NULL;
1245         critical_enter();
1246         cache = &zone->uz_cpu[curcpu];
1247         domain = PCPU_GET(domain);
1248         b1 = cache_bucket_unload_alloc(cache);
1249
1250         /*
1251          * Don't flush SMR zone buckets.  This leaves the zone without a
1252          * bucket and forces every free to synchronize().
1253          */
1254         if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
1255                 b2 = cache_bucket_unload_free(cache);
1256                 b3 = cache_bucket_unload_cross(cache);
1257         }
1258         critical_exit();
1259
1260         if (b1 != NULL)
1261                 zone_free_bucket(zone, b1, NULL, domain, false);
1262         if (b2 != NULL)
1263                 zone_free_bucket(zone, b2, NULL, domain, false);
1264         if (b3 != NULL) {
1265                 /* Adjust the domain so it goes to zone_free_cross. */
1266                 domain = (domain + 1) % vm_ndomains;
1267                 zone_free_bucket(zone, b3, NULL, domain, false);
1268         }
1269 }
1270
1271 /*
1272  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
1273  * This is an expensive call because it needs to bind to all CPUs
1274  * one by one and enter a critical section on each of them in order
1275  * to safely access their cache buckets.
1276  * Zone lock must not be held on call this function.
1277  */
1278 static void
1279 pcpu_cache_drain_safe(uma_zone_t zone)
1280 {
1281         int cpu;
1282
1283         /*
1284          * Polite bucket sizes shrinking was not enough, shrink aggressively.
1285          */
1286         if (zone)
1287                 cache_shrink(zone, NULL);
1288         else
1289                 zone_foreach(cache_shrink, NULL);
1290
1291         CPU_FOREACH(cpu) {
1292                 thread_lock(curthread);
1293                 sched_bind(curthread, cpu);
1294                 thread_unlock(curthread);
1295
1296                 if (zone)
1297                         cache_drain_safe_cpu(zone, NULL);
1298                 else
1299                         zone_foreach(cache_drain_safe_cpu, NULL);
1300         }
1301         thread_lock(curthread);
1302         sched_unbind(curthread);
1303         thread_unlock(curthread);
1304 }
1305
1306 /*
1307  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
1308  * requested a drain, otherwise the per-domain caches are trimmed to either
1309  * estimated working set size.
1310  */
1311 static void
1312 bucket_cache_reclaim(uma_zone_t zone, bool drain)
1313 {
1314         uma_zone_domain_t zdom;
1315         uma_bucket_t bucket;
1316         long target;
1317         int i;
1318
1319         /*
1320          * Shrink the zone bucket size to ensure that the per-CPU caches
1321          * don't grow too large.
1322          */
1323         if (zone->uz_bucket_size > zone->uz_bucket_size_min)
1324                 zone->uz_bucket_size--;
1325
1326         for (i = 0; i < vm_ndomains; i++) {
1327                 /*
1328                  * The cross bucket is partially filled and not part of
1329                  * the item count.  Reclaim it individually here.
1330                  */
1331                 zdom = ZDOM_GET(zone, i);
1332                 if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
1333                         ZONE_CROSS_LOCK(zone);
1334                         bucket = zdom->uzd_cross;
1335                         zdom->uzd_cross = NULL;
1336                         ZONE_CROSS_UNLOCK(zone);
1337                         if (bucket != NULL)
1338                                 bucket_free(zone, bucket, NULL);
1339                 }
1340
1341                 /*
1342                  * If we were asked to drain the zone, we are done only once
1343                  * this bucket cache is empty.  Otherwise, we reclaim items in
1344                  * excess of the zone's estimated working set size.  If the
1345                  * difference nitems - imin is larger than the WSS estimate,
1346                  * then the estimate will grow at the end of this interval and
1347                  * we ignore the historical average.
1348                  */
1349                 ZDOM_LOCK(zdom);
1350                 target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
1351                     zdom->uzd_imin);
1352                 while (zdom->uzd_nitems > target) {
1353                         bucket = zone_fetch_bucket(zone, zdom, true);
1354                         if (bucket == NULL)
1355                                 break;
1356                         bucket_free(zone, bucket, NULL);
1357                         ZDOM_LOCK(zdom);
1358                 }
1359                 ZDOM_UNLOCK(zdom);
1360         }
1361 }
1362
1363 static void
1364 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
1365 {
1366         uint8_t *mem;
1367         int i;
1368         uint8_t flags;
1369
1370         CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
1371             keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
1372
1373         mem = slab_data(slab, keg);
1374         flags = slab->us_flags;
1375         i = start;
1376         if (keg->uk_fini != NULL) {
1377                 for (i--; i > -1; i--)
1378 #ifdef INVARIANTS
1379                 /*
1380                  * trash_fini implies that dtor was trash_dtor. trash_fini
1381                  * would check that memory hasn't been modified since free,
1382                  * which executed trash_dtor.
1383                  * That's why we need to run uma_dbg_kskip() check here,
1384                  * albeit we don't make skip check for other init/fini
1385                  * invocations.
1386                  */
1387                 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
1388                     keg->uk_fini != trash_fini)
1389 #endif
1390                         keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
1391         }
1392         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
1393                 zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab),
1394                     NULL, SKIP_NONE);
1395         keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1396         uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1397 }
1398
1399 /*
1400  * Frees pages from a keg back to the system.  This is done on demand from
1401  * the pageout daemon.
1402  *
1403  * Returns nothing.
1404  */
1405 static void
1406 keg_drain(uma_keg_t keg)
1407 {
1408         struct slabhead freeslabs;
1409         uma_domain_t dom;
1410         uma_slab_t slab, tmp;
1411         int i, n;
1412
1413         if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1414                 return;
1415
1416         for (i = 0; i < vm_ndomains; i++) {
1417                 CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u",
1418                     keg->uk_name, keg, i, dom->ud_free_items);
1419                 dom = &keg->uk_domain[i];
1420                 LIST_INIT(&freeslabs);
1421
1422                 KEG_LOCK(keg, i);
1423                 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) {
1424                         LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
1425                                 UMA_HASH_REMOVE(&keg->uk_hash, slab);
1426                 }
1427                 n = dom->ud_free_slabs;
1428                 LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link);
1429                 dom->ud_free_slabs = 0;
1430                 dom->ud_free_items -= n * keg->uk_ipers;
1431                 dom->ud_pages -= n * keg->uk_ppera;
1432                 KEG_UNLOCK(keg, i);
1433
1434                 LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp)
1435                         keg_free_slab(keg, slab, keg->uk_ipers);
1436         }
1437 }
1438
1439 static void
1440 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1441 {
1442
1443         /*
1444          * Set draining to interlock with zone_dtor() so we can release our
1445          * locks as we go.  Only dtor() should do a WAITOK call since it
1446          * is the only call that knows the structure will still be available
1447          * when it wakes up.
1448          */
1449         ZONE_LOCK(zone);
1450         while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1451                 if (waitok == M_NOWAIT)
1452                         goto out;
1453                 msleep(zone, &ZDOM_GET(zone, 0)->uzd_lock, PVM, "zonedrain",
1454                     1);
1455         }
1456         zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1457         ZONE_UNLOCK(zone);
1458         bucket_cache_reclaim(zone, drain);
1459
1460         /*
1461          * The DRAINING flag protects us from being freed while
1462          * we're running.  Normally the uma_rwlock would protect us but we
1463          * must be able to release and acquire the right lock for each keg.
1464          */
1465         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
1466                 keg_drain(zone->uz_keg);
1467         ZONE_LOCK(zone);
1468         zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1469         wakeup(zone);
1470 out:
1471         ZONE_UNLOCK(zone);
1472 }
1473
1474 static void
1475 zone_drain(uma_zone_t zone, void *unused)
1476 {
1477
1478         zone_reclaim(zone, M_NOWAIT, true);
1479 }
1480
1481 static void
1482 zone_trim(uma_zone_t zone, void *unused)
1483 {
1484
1485         zone_reclaim(zone, M_NOWAIT, false);
1486 }
1487
1488 /*
1489  * Allocate a new slab for a keg and inserts it into the partial slab list.
1490  * The keg should be unlocked on entry.  If the allocation succeeds it will
1491  * be locked on return.
1492  *
1493  * Arguments:
1494  *      flags   Wait flags for the item initialization routine
1495  *      aflags  Wait flags for the slab allocation
1496  *
1497  * Returns:
1498  *      The slab that was allocated or NULL if there is no memory and the
1499  *      caller specified M_NOWAIT.
1500  */
1501 static uma_slab_t
1502 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1503     int aflags)
1504 {
1505         uma_domain_t dom;
1506         uma_alloc allocf;
1507         uma_slab_t slab;
1508         unsigned long size;
1509         uint8_t *mem;
1510         uint8_t sflags;
1511         int i;
1512
1513         KASSERT(domain >= 0 && domain < vm_ndomains,
1514             ("keg_alloc_slab: domain %d out of range", domain));
1515
1516         allocf = keg->uk_allocf;
1517         slab = NULL;
1518         mem = NULL;
1519         if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) {
1520                 uma_hash_slab_t hslab;
1521                 hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL,
1522                     domain, aflags);
1523                 if (hslab == NULL)
1524                         goto fail;
1525                 slab = &hslab->uhs_slab;
1526         }
1527
1528         /*
1529          * This reproduces the old vm_zone behavior of zero filling pages the
1530          * first time they are added to a zone.
1531          *
1532          * Malloced items are zeroed in uma_zalloc.
1533          */
1534
1535         if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1536                 aflags |= M_ZERO;
1537         else
1538                 aflags &= ~M_ZERO;
1539
1540         if (keg->uk_flags & UMA_ZONE_NODUMP)
1541                 aflags |= M_NODUMP;
1542
1543         /* zone is passed for legacy reasons. */
1544         size = keg->uk_ppera * PAGE_SIZE;
1545         mem = allocf(zone, size, domain, &sflags, aflags);
1546         if (mem == NULL) {
1547                 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE)
1548                         zone_free_item(slabzone(keg->uk_ipers),
1549                             slab_tohashslab(slab), NULL, SKIP_NONE);
1550                 goto fail;
1551         }
1552         uma_total_inc(size);
1553
1554         /* For HASH zones all pages go to the same uma_domain. */
1555         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
1556                 domain = 0;
1557
1558         /* Point the slab into the allocated memory */
1559         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE))
1560                 slab = (uma_slab_t )(mem + keg->uk_pgoff);
1561         else
1562                 slab_tohashslab(slab)->uhs_data = mem;
1563
1564         if (keg->uk_flags & UMA_ZFLAG_VTOSLAB)
1565                 for (i = 0; i < keg->uk_ppera; i++)
1566                         vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
1567                             zone, slab);
1568
1569         slab->us_freecount = keg->uk_ipers;
1570         slab->us_flags = sflags;
1571         slab->us_domain = domain;
1572
1573         BIT_FILL(keg->uk_ipers, &slab->us_free);
1574 #ifdef INVARIANTS
1575         BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
1576 #endif
1577
1578         if (keg->uk_init != NULL) {
1579                 for (i = 0; i < keg->uk_ipers; i++)
1580                         if (keg->uk_init(slab_item(slab, keg, i),
1581                             keg->uk_size, flags) != 0)
1582                                 break;
1583                 if (i != keg->uk_ipers) {
1584                         keg_free_slab(keg, slab, i);
1585                         goto fail;
1586                 }
1587         }
1588         KEG_LOCK(keg, domain);
1589
1590         CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1591             slab, keg->uk_name, keg);
1592
1593         if (keg->uk_flags & UMA_ZFLAG_HASH)
1594                 UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1595
1596         /*
1597          * If we got a slab here it's safe to mark it partially used
1598          * and return.  We assume that the caller is going to remove
1599          * at least one item.
1600          */
1601         dom = &keg->uk_domain[domain];
1602         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
1603         dom->ud_pages += keg->uk_ppera;
1604         dom->ud_free_items += keg->uk_ipers;
1605
1606         return (slab);
1607
1608 fail:
1609         return (NULL);
1610 }
1611
1612 /*
1613  * This function is intended to be used early on in place of page_alloc() so
1614  * that we may use the boot time page cache to satisfy allocations before
1615  * the VM is ready.
1616  */
1617 static void *
1618 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1619     int wait)
1620 {
1621         vm_paddr_t pa;
1622         vm_page_t m;
1623         void *mem;
1624         int pages;
1625         int i;
1626
1627         pages = howmany(bytes, PAGE_SIZE);
1628         KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1629
1630         *pflag = UMA_SLAB_BOOT;
1631         m = vm_page_alloc_contig_domain(NULL, 0, domain,
1632             malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, pages,
1633             (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT);
1634         if (m == NULL)
1635                 return (NULL);
1636
1637         pa = VM_PAGE_TO_PHYS(m);
1638         for (i = 0; i < pages; i++, pa += PAGE_SIZE) {
1639 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
1640     defined(__riscv) || defined(__powerpc64__)
1641                 if ((wait & M_NODUMP) == 0)
1642                         dump_add_page(pa);
1643 #endif
1644         }
1645         /* Allocate KVA and indirectly advance bootmem. */
1646         mem = (void *)pmap_map(&bootmem, m->phys_addr,
1647             m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE);
1648         if ((wait & M_ZERO) != 0)
1649                 bzero(mem, pages * PAGE_SIZE);
1650
1651         return (mem);
1652 }
1653
1654 static void
1655 startup_free(void *mem, vm_size_t bytes)
1656 {
1657         vm_offset_t va;
1658         vm_page_t m;
1659
1660         va = (vm_offset_t)mem;
1661         m = PHYS_TO_VM_PAGE(pmap_kextract(va));
1662         pmap_remove(kernel_pmap, va, va + bytes);
1663         for (; bytes != 0; bytes -= PAGE_SIZE, m++) {
1664 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \
1665     defined(__riscv) || defined(__powerpc64__)
1666                 dump_drop_page(VM_PAGE_TO_PHYS(m));
1667 #endif
1668                 vm_page_unwire_noq(m);
1669                 vm_page_free(m);
1670         }
1671 }
1672
1673 /*
1674  * Allocates a number of pages from the system
1675  *
1676  * Arguments:
1677  *      bytes  The number of bytes requested
1678  *      wait  Shall we wait?
1679  *
1680  * Returns:
1681  *      A pointer to the alloced memory or possibly
1682  *      NULL if M_NOWAIT is set.
1683  */
1684 static void *
1685 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1686     int wait)
1687 {
1688         void *p;        /* Returned page */
1689
1690         *pflag = UMA_SLAB_KERNEL;
1691         p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1692
1693         return (p);
1694 }
1695
1696 static void *
1697 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1698     int wait)
1699 {
1700         struct pglist alloctail;
1701         vm_offset_t addr, zkva;
1702         int cpu, flags;
1703         vm_page_t p, p_next;
1704 #ifdef NUMA
1705         struct pcpu *pc;
1706 #endif
1707
1708         MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1709
1710         TAILQ_INIT(&alloctail);
1711         flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1712             malloc2vm_flags(wait);
1713         *pflag = UMA_SLAB_KERNEL;
1714         for (cpu = 0; cpu <= mp_maxid; cpu++) {
1715                 if (CPU_ABSENT(cpu)) {
1716                         p = vm_page_alloc(NULL, 0, flags);
1717                 } else {
1718 #ifndef NUMA
1719                         p = vm_page_alloc(NULL, 0, flags);
1720 #else
1721                         pc = pcpu_find(cpu);
1722                         if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain)))
1723                                 p = NULL;
1724                         else
1725                                 p = vm_page_alloc_domain(NULL, 0,
1726                                     pc->pc_domain, flags);
1727                         if (__predict_false(p == NULL))
1728                                 p = vm_page_alloc(NULL, 0, flags);
1729 #endif
1730                 }
1731                 if (__predict_false(p == NULL))
1732                         goto fail;
1733                 TAILQ_INSERT_TAIL(&alloctail, p, listq);
1734         }
1735         if ((addr = kva_alloc(bytes)) == 0)
1736                 goto fail;
1737         zkva = addr;
1738         TAILQ_FOREACH(p, &alloctail, listq) {
1739                 pmap_qenter(zkva, &p, 1);
1740                 zkva += PAGE_SIZE;
1741         }
1742         return ((void*)addr);
1743 fail:
1744         TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1745                 vm_page_unwire_noq(p);
1746                 vm_page_free(p);
1747         }
1748         return (NULL);
1749 }
1750
1751 /*
1752  * Allocates a number of pages from within an object
1753  *
1754  * Arguments:
1755  *      bytes  The number of bytes requested
1756  *      wait   Shall we wait?
1757  *
1758  * Returns:
1759  *      A pointer to the alloced memory or possibly
1760  *      NULL if M_NOWAIT is set.
1761  */
1762 static void *
1763 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1764     int wait)
1765 {
1766         TAILQ_HEAD(, vm_page) alloctail;
1767         u_long npages;
1768         vm_offset_t retkva, zkva;
1769         vm_page_t p, p_next;
1770         uma_keg_t keg;
1771
1772         TAILQ_INIT(&alloctail);
1773         keg = zone->uz_keg;
1774
1775         npages = howmany(bytes, PAGE_SIZE);
1776         while (npages > 0) {
1777                 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1778                     VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1779                     ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1780                     VM_ALLOC_NOWAIT));
1781                 if (p != NULL) {
1782                         /*
1783                          * Since the page does not belong to an object, its
1784                          * listq is unused.
1785                          */
1786                         TAILQ_INSERT_TAIL(&alloctail, p, listq);
1787                         npages--;
1788                         continue;
1789                 }
1790                 /*
1791                  * Page allocation failed, free intermediate pages and
1792                  * exit.
1793                  */
1794                 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1795                         vm_page_unwire_noq(p);
1796                         vm_page_free(p);
1797                 }
1798                 return (NULL);
1799         }
1800         *flags = UMA_SLAB_PRIV;
1801         zkva = keg->uk_kva +
1802             atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1803         retkva = zkva;
1804         TAILQ_FOREACH(p, &alloctail, listq) {
1805                 pmap_qenter(zkva, &p, 1);
1806                 zkva += PAGE_SIZE;
1807         }
1808
1809         return ((void *)retkva);
1810 }
1811
1812 /*
1813  * Allocate physically contiguous pages.
1814  */
1815 static void *
1816 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1817     int wait)
1818 {
1819
1820         *pflag = UMA_SLAB_KERNEL;
1821         return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
1822             bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
1823 }
1824
1825 /*
1826  * Frees a number of pages to the system
1827  *
1828  * Arguments:
1829  *      mem   A pointer to the memory to be freed
1830  *      size  The size of the memory being freed
1831  *      flags The original p->us_flags field
1832  *
1833  * Returns:
1834  *      Nothing
1835  */
1836 static void
1837 page_free(void *mem, vm_size_t size, uint8_t flags)
1838 {
1839
1840         if ((flags & UMA_SLAB_BOOT) != 0) {
1841                 startup_free(mem, size);
1842                 return;
1843         }
1844
1845         KASSERT((flags & UMA_SLAB_KERNEL) != 0,
1846             ("UMA: page_free used with invalid flags %x", flags));
1847
1848         kmem_free((vm_offset_t)mem, size);
1849 }
1850
1851 /*
1852  * Frees pcpu zone allocations
1853  *
1854  * Arguments:
1855  *      mem   A pointer to the memory to be freed
1856  *      size  The size of the memory being freed
1857  *      flags The original p->us_flags field
1858  *
1859  * Returns:
1860  *      Nothing
1861  */
1862 static void
1863 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1864 {
1865         vm_offset_t sva, curva;
1866         vm_paddr_t paddr;
1867         vm_page_t m;
1868
1869         MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1870
1871         if ((flags & UMA_SLAB_BOOT) != 0) {
1872                 startup_free(mem, size);
1873                 return;
1874         }
1875
1876         sva = (vm_offset_t)mem;
1877         for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1878                 paddr = pmap_kextract(curva);
1879                 m = PHYS_TO_VM_PAGE(paddr);
1880                 vm_page_unwire_noq(m);
1881                 vm_page_free(m);
1882         }
1883         pmap_qremove(sva, size >> PAGE_SHIFT);
1884         kva_free(sva, size);
1885 }
1886
1887
1888 /*
1889  * Zero fill initializer
1890  *
1891  * Arguments/Returns follow uma_init specifications
1892  */
1893 static int
1894 zero_init(void *mem, int size, int flags)
1895 {
1896         bzero(mem, size);
1897         return (0);
1898 }
1899
1900 #ifdef INVARIANTS
1901 struct noslabbits *
1902 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
1903 {
1904
1905         return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers)));
1906 }
1907 #endif
1908
1909 /*
1910  * Actual size of embedded struct slab (!OFFPAGE).
1911  */
1912 size_t
1913 slab_sizeof(int nitems)
1914 {
1915         size_t s;
1916
1917         s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS;
1918         return (roundup(s, UMA_ALIGN_PTR + 1));
1919 }
1920
1921 /*
1922  * Size of memory for embedded slabs (!OFFPAGE).
1923  */
1924 size_t
1925 slab_space(int nitems)
1926 {
1927         return (UMA_SLAB_SIZE - slab_sizeof(nitems));
1928 }
1929
1930 #define UMA_FIXPT_SHIFT 31
1931 #define UMA_FRAC_FIXPT(n, d)                                            \
1932         ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d)))
1933 #define UMA_FIXPT_PCT(f)                                                \
1934         ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT))
1935 #define UMA_PCT_FIXPT(pct)      UMA_FRAC_FIXPT((pct), 100)
1936 #define UMA_MIN_EFF     UMA_PCT_FIXPT(100 - UMA_MAX_WASTE)
1937
1938 /*
1939  * Compute the number of items that will fit in a slab.  If hdr is true, the
1940  * item count may be limited to provide space in the slab for an inline slab
1941  * header.  Otherwise, all slab space will be provided for item storage.
1942  */
1943 static u_int
1944 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr)
1945 {
1946         u_int ipers;
1947         u_int padpi;
1948
1949         /* The padding between items is not needed after the last item. */
1950         padpi = rsize - size;
1951
1952         if (hdr) {
1953                 /*
1954                  * Start with the maximum item count and remove items until
1955                  * the slab header first alongside the allocatable memory.
1956                  */
1957                 for (ipers = MIN(SLAB_MAX_SETSIZE,
1958                     (slabsize + padpi - slab_sizeof(1)) / rsize);
1959                     ipers > 0 &&
1960                     ipers * rsize - padpi + slab_sizeof(ipers) > slabsize;
1961                     ipers--)
1962                         continue;
1963         } else {
1964                 ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE);
1965         }
1966
1967         return (ipers);
1968 }
1969
1970 /*
1971  * Compute the number of items that will fit in a slab for a startup zone.
1972  */
1973 int
1974 slab_ipers(size_t size, int align)
1975 {
1976         int rsize;
1977
1978         rsize = roundup(size, align + 1); /* Assume no CACHESPREAD */
1979         return (slab_ipers_hdr(size, rsize, UMA_SLAB_SIZE, true));
1980 }
1981
1982 struct keg_layout_result {
1983         u_int format;
1984         u_int slabsize;
1985         u_int ipers;
1986         u_int eff;
1987 };
1988
1989 static void
1990 keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt,
1991     struct keg_layout_result *kl)
1992 {
1993         u_int total;
1994
1995         kl->format = fmt;
1996         kl->slabsize = slabsize;
1997
1998         /* Handle INTERNAL as inline with an extra page. */
1999         if ((fmt & UMA_ZFLAG_INTERNAL) != 0) {
2000                 kl->format &= ~UMA_ZFLAG_INTERNAL;
2001                 kl->slabsize += PAGE_SIZE;
2002         }
2003
2004         kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize,
2005             (fmt & UMA_ZFLAG_OFFPAGE) == 0);
2006
2007         /* Account for memory used by an offpage slab header. */
2008         total = kl->slabsize;
2009         if ((fmt & UMA_ZFLAG_OFFPAGE) != 0)
2010                 total += slabzone(kl->ipers)->uz_keg->uk_rsize;
2011
2012         kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total);
2013 }
2014
2015 /*
2016  * Determine the format of a uma keg.  This determines where the slab header
2017  * will be placed (inline or offpage) and calculates ipers, rsize, and ppera.
2018  *
2019  * Arguments
2020  *      keg  The zone we should initialize
2021  *
2022  * Returns
2023  *      Nothing
2024  */
2025 static void
2026 keg_layout(uma_keg_t keg)
2027 {
2028         struct keg_layout_result kl = {}, kl_tmp;
2029         u_int fmts[2];
2030         u_int alignsize;
2031         u_int nfmt;
2032         u_int pages;
2033         u_int rsize;
2034         u_int slabsize;
2035         u_int i, j;
2036
2037         KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
2038             (keg->uk_size <= UMA_PCPU_ALLOC_SIZE &&
2039              (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0),
2040             ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b",
2041              __func__, keg->uk_name, keg->uk_size, keg->uk_flags,
2042              PRINT_UMA_ZFLAGS));
2043         KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 ||
2044             (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0,
2045             ("%s: incompatible flags 0x%b", __func__, keg->uk_flags,
2046              PRINT_UMA_ZFLAGS));
2047
2048         alignsize = keg->uk_align + 1;
2049
2050         /*
2051          * Calculate the size of each allocation (rsize) according to
2052          * alignment.  If the requested size is smaller than we have
2053          * allocation bits for we round it up.
2054          */
2055         rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT);
2056         rsize = roundup2(rsize, alignsize);
2057
2058         if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) {
2059                 /*
2060                  * We want one item to start on every align boundary in a page.
2061                  * To do this we will span pages.  We will also extend the item
2062                  * by the size of align if it is an even multiple of align.
2063                  * Otherwise, it would fall on the same boundary every time.
2064                  */
2065                 if ((rsize & alignsize) == 0)
2066                         rsize += alignsize;
2067                 slabsize = rsize * (PAGE_SIZE / alignsize);
2068                 slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE);
2069                 slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE);
2070                 slabsize = round_page(slabsize);
2071         } else {
2072                 /*
2073                  * Start with a slab size of as many pages as it takes to
2074                  * represent a single item.  We will try to fit as many
2075                  * additional items into the slab as possible.
2076                  */
2077                 slabsize = round_page(keg->uk_size);
2078         }
2079
2080         /* Build a list of all of the available formats for this keg. */
2081         nfmt = 0;
2082
2083         /* Evaluate an inline slab layout. */
2084         if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0)
2085                 fmts[nfmt++] = 0;
2086
2087         /* TODO: vm_page-embedded slab. */
2088
2089         /*
2090          * We can't do OFFPAGE if we're internal or if we've been
2091          * asked to not go to the VM for buckets.  If we do this we
2092          * may end up going to the VM for slabs which we do not want
2093          * to do if we're UMA_ZONE_VM, which clearly forbids it.
2094          * In those cases, evaluate a pseudo-format called INTERNAL
2095          * which has an inline slab header and one extra page to
2096          * guarantee that it fits.
2097          *
2098          * Otherwise, see if using an OFFPAGE slab will improve our
2099          * efficiency.
2100          */
2101         if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0)
2102                 fmts[nfmt++] = UMA_ZFLAG_INTERNAL;
2103         else
2104                 fmts[nfmt++] = UMA_ZFLAG_OFFPAGE;
2105
2106         /*
2107          * Choose a slab size and format which satisfy the minimum efficiency.
2108          * Prefer the smallest slab size that meets the constraints.
2109          *
2110          * Start with a minimum slab size, to accommodate CACHESPREAD.  Then,
2111          * for small items (up to PAGE_SIZE), the iteration increment is one
2112          * page; and for large items, the increment is one item.
2113          */
2114         i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize);
2115         KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u",
2116             keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize,
2117             rsize, i));
2118         for ( ; ; i++) {
2119                 slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) :
2120                     round_page(rsize * (i - 1) + keg->uk_size);
2121
2122                 for (j = 0; j < nfmt; j++) {
2123                         /* Only if we have no viable format yet. */
2124                         if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 &&
2125                             kl.ipers > 0)
2126                                 continue;
2127
2128                         keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp);
2129                         if (kl_tmp.eff <= kl.eff)
2130                                 continue;
2131
2132                         kl = kl_tmp;
2133
2134                         CTR6(KTR_UMA, "keg %s layout: format %#x "
2135                             "(ipers %u * rsize %u) / slabsize %#x = %u%% eff",
2136                             keg->uk_name, kl.format, kl.ipers, rsize,
2137                             kl.slabsize, UMA_FIXPT_PCT(kl.eff));
2138
2139                         /* Stop when we reach the minimum efficiency. */
2140                         if (kl.eff >= UMA_MIN_EFF)
2141                                 break;
2142                 }
2143
2144                 if (kl.eff >= UMA_MIN_EFF || !multipage_slabs ||
2145                     slabsize >= SLAB_MAX_SETSIZE * rsize ||
2146                     (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0)
2147                         break;
2148         }
2149
2150         pages = atop(kl.slabsize);
2151         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
2152                 pages *= mp_maxid + 1;
2153
2154         keg->uk_rsize = rsize;
2155         keg->uk_ipers = kl.ipers;
2156         keg->uk_ppera = pages;
2157         keg->uk_flags |= kl.format;
2158
2159         /*
2160          * How do we find the slab header if it is offpage or if not all item
2161          * start addresses are in the same page?  We could solve the latter
2162          * case with vaddr alignment, but we don't.
2163          */
2164         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 ||
2165             (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) {
2166                 if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0)
2167                         keg->uk_flags |= UMA_ZFLAG_HASH;
2168                 else
2169                         keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
2170         }
2171
2172         CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u",
2173             __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers,
2174             pages);
2175         KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
2176             ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__,
2177              keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize,
2178              keg->uk_ipers, pages));
2179 }
2180
2181 /*
2182  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
2183  * the keg onto the global keg list.
2184  *
2185  * Arguments/Returns follow uma_ctor specifications
2186  *      udata  Actually uma_kctor_args
2187  */
2188 static int
2189 keg_ctor(void *mem, int size, void *udata, int flags)
2190 {
2191         struct uma_kctor_args *arg = udata;
2192         uma_keg_t keg = mem;
2193         uma_zone_t zone;
2194         int i;
2195
2196         bzero(keg, size);
2197         keg->uk_size = arg->size;
2198         keg->uk_init = arg->uminit;
2199         keg->uk_fini = arg->fini;
2200         keg->uk_align = arg->align;
2201         keg->uk_reserve = 0;
2202         keg->uk_flags = arg->flags;
2203
2204         /*
2205          * We use a global round-robin policy by default.  Zones with
2206          * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which
2207          * case the iterator is never run.
2208          */
2209         keg->uk_dr.dr_policy = DOMAINSET_RR();
2210         keg->uk_dr.dr_iter = 0;
2211
2212         /*
2213          * The master zone is passed to us at keg-creation time.
2214          */
2215         zone = arg->zone;
2216         keg->uk_name = zone->uz_name;
2217
2218         if (arg->flags & UMA_ZONE_ZINIT)
2219                 keg->uk_init = zero_init;
2220
2221         if (arg->flags & UMA_ZONE_MALLOC)
2222                 keg->uk_flags |= UMA_ZFLAG_VTOSLAB;
2223
2224 #ifndef SMP
2225         keg->uk_flags &= ~UMA_ZONE_PCPU;
2226 #endif
2227
2228         keg_layout(keg);
2229
2230         /*
2231          * Use a first-touch NUMA policy for kegs that pmap_extract() will
2232          * work on.  Use round-robin for everything else.
2233          *
2234          * Zones may override the default by specifying either.
2235          */
2236 #ifdef NUMA
2237         if ((keg->uk_flags &
2238             (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0)
2239                 keg->uk_flags |= UMA_ZONE_FIRSTTOUCH;
2240         else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0)
2241                 keg->uk_flags |= UMA_ZONE_ROUNDROBIN;
2242 #endif
2243
2244         /*
2245          * If we haven't booted yet we need allocations to go through the
2246          * startup cache until the vm is ready.
2247          */
2248 #ifdef UMA_MD_SMALL_ALLOC
2249         if (keg->uk_ppera == 1)
2250                 keg->uk_allocf = uma_small_alloc;
2251         else
2252 #endif
2253         if (booted < BOOT_KVA)
2254                 keg->uk_allocf = startup_alloc;
2255         else if (keg->uk_flags & UMA_ZONE_PCPU)
2256                 keg->uk_allocf = pcpu_page_alloc;
2257         else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1)
2258                 keg->uk_allocf = contig_alloc;
2259         else
2260                 keg->uk_allocf = page_alloc;
2261 #ifdef UMA_MD_SMALL_ALLOC
2262         if (keg->uk_ppera == 1)
2263                 keg->uk_freef = uma_small_free;
2264         else
2265 #endif
2266         if (keg->uk_flags & UMA_ZONE_PCPU)
2267                 keg->uk_freef = pcpu_page_free;
2268         else
2269                 keg->uk_freef = page_free;
2270
2271         /*
2272          * Initialize keg's locks.
2273          */
2274         for (i = 0; i < vm_ndomains; i++)
2275                 KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS));
2276
2277         /*
2278          * If we're putting the slab header in the actual page we need to
2279          * figure out where in each page it goes.  See slab_sizeof
2280          * definition.
2281          */
2282         if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) {
2283                 size_t shsize;
2284
2285                 shsize = slab_sizeof(keg->uk_ipers);
2286                 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
2287                 /*
2288                  * The only way the following is possible is if with our
2289                  * UMA_ALIGN_PTR adjustments we are now bigger than
2290                  * UMA_SLAB_SIZE.  I haven't checked whether this is
2291                  * mathematically possible for all cases, so we make
2292                  * sure here anyway.
2293                  */
2294                 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
2295                     ("zone %s ipers %d rsize %d size %d slab won't fit",
2296                     zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
2297         }
2298
2299         if (keg->uk_flags & UMA_ZFLAG_HASH)
2300                 hash_alloc(&keg->uk_hash, 0);
2301
2302         CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone);
2303
2304         LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
2305
2306         rw_wlock(&uma_rwlock);
2307         LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
2308         rw_wunlock(&uma_rwlock);
2309         return (0);
2310 }
2311
2312 static void
2313 zone_kva_available(uma_zone_t zone, void *unused)
2314 {
2315         uma_keg_t keg;
2316
2317         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
2318                 return;
2319         KEG_GET(zone, keg);
2320
2321         if (keg->uk_allocf == startup_alloc) {
2322                 /* Switch to the real allocator. */
2323                 if (keg->uk_flags & UMA_ZONE_PCPU)
2324                         keg->uk_allocf = pcpu_page_alloc;
2325                 else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 &&
2326                     keg->uk_ppera > 1)
2327                         keg->uk_allocf = contig_alloc;
2328                 else
2329                         keg->uk_allocf = page_alloc;
2330         }
2331 }
2332
2333 static void
2334 zone_alloc_counters(uma_zone_t zone, void *unused)
2335 {
2336
2337         zone->uz_allocs = counter_u64_alloc(M_WAITOK);
2338         zone->uz_frees = counter_u64_alloc(M_WAITOK);
2339         zone->uz_fails = counter_u64_alloc(M_WAITOK);
2340         zone->uz_xdomain = counter_u64_alloc(M_WAITOK);
2341 }
2342
2343 static void
2344 zone_alloc_sysctl(uma_zone_t zone, void *unused)
2345 {
2346         uma_zone_domain_t zdom;
2347         uma_domain_t dom;
2348         uma_keg_t keg;
2349         struct sysctl_oid *oid, *domainoid;
2350         int domains, i, cnt;
2351         static const char *nokeg = "cache zone";
2352         char *c;
2353
2354         /*
2355          * Make a sysctl safe copy of the zone name by removing
2356          * any special characters and handling dups by appending
2357          * an index.
2358          */
2359         if (zone->uz_namecnt != 0) {
2360                 /* Count the number of decimal digits and '_' separator. */
2361                 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
2362                         cnt /= 10;
2363                 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
2364                     M_UMA, M_WAITOK);
2365                 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
2366                     zone->uz_namecnt);
2367         } else
2368                 zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
2369         for (c = zone->uz_ctlname; *c != '\0'; c++)
2370                 if (strchr("./\\ -", *c) != NULL)
2371                         *c = '_';
2372
2373         /*
2374          * Basic parameters at the root.
2375          */
2376         zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
2377             OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2378         oid = zone->uz_oid;
2379         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2380             "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
2381         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2382             "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
2383             zone, 0, sysctl_handle_uma_zone_flags, "A",
2384             "Allocator configuration flags");
2385         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2386             "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
2387             "Desired per-cpu cache size");
2388         SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2389             "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
2390             "Maximum allowed per-cpu cache size");
2391
2392         /*
2393          * keg if present.
2394          */
2395         if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
2396                 domains = vm_ndomains;
2397         else
2398                 domains = 1;
2399         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2400             "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2401         keg = zone->uz_keg;
2402         if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
2403                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2404                     "name", CTLFLAG_RD, keg->uk_name, "Keg name");
2405                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2406                     "rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
2407                     "Real object size with alignment");
2408                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2409                     "ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
2410                     "pages per-slab allocation");
2411                 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2412                     "ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
2413                     "items available per-slab");
2414                 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2415                     "align", CTLFLAG_RD, &keg->uk_align, 0,
2416                     "item alignment mask");
2417                 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2418                     "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
2419                     keg, 0, sysctl_handle_uma_slab_efficiency, "I",
2420                     "Slab utilization (100 - internal fragmentation %)");
2421                 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid),
2422                     OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2423                 for (i = 0; i < domains; i++) {
2424                         dom = &keg->uk_domain[i];
2425                         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
2426                             OID_AUTO, VM_DOMAIN(i)->vmd_name,
2427                             CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2428                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2429                             "pages", CTLFLAG_RD, &dom->ud_pages, 0,
2430                             "Total pages currently allocated from VM");
2431                         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2432                             "free_items", CTLFLAG_RD, &dom->ud_free_items, 0,
2433                             "items free in the slab layer");
2434                 }
2435         } else
2436                 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2437                     "name", CTLFLAG_RD, nokeg, "Keg name");
2438
2439         /*
2440          * Information about zone limits.
2441          */
2442         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2443             "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2444         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2445             "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2446             zone, 0, sysctl_handle_uma_zone_items, "QU",
2447             "current number of allocated items if limit is set");
2448         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2449             "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
2450             "Maximum number of cached items");
2451         SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2452             "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
2453             "Number of threads sleeping at limit");
2454         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2455             "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
2456             "Total zone limit sleeps");
2457         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2458             "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0,
2459             "Maximum number of items in each domain's bucket cache");
2460
2461         /*
2462          * Per-domain zone information.
2463          */
2464         domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
2465             OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2466         for (i = 0; i < domains; i++) {
2467                 zdom = ZDOM_GET(zone, i);
2468                 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
2469                     OID_AUTO, VM_DOMAIN(i)->vmd_name,
2470                     CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2471                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2472                     "nitems", CTLFLAG_RD, &zdom->uzd_nitems,
2473                     "number of items in this domain");
2474                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2475                     "imax", CTLFLAG_RD, &zdom->uzd_imax,
2476                     "maximum item count in this period");
2477                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2478                     "imin", CTLFLAG_RD, &zdom->uzd_imin,
2479                     "minimum item count in this period");
2480                 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2481                     "wss", CTLFLAG_RD, &zdom->uzd_wss,
2482                     "Working set size");
2483         }
2484
2485         /*
2486          * General statistics.
2487          */
2488         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2489             "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2490         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2491             "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
2492             zone, 1, sysctl_handle_uma_zone_cur, "I",
2493             "Current number of allocated items");
2494         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2495             "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2496             zone, 0, sysctl_handle_uma_zone_allocs, "QU",
2497             "Total allocation calls");
2498         SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2499             "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2500             zone, 0, sysctl_handle_uma_zone_frees, "QU",
2501             "Total free calls");
2502         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2503             "fails", CTLFLAG_RD, &zone->uz_fails,
2504             "Number of allocation failures");
2505         SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2506             "xdomain", CTLFLAG_RD, &zone->uz_xdomain,
2507             "Free calls from the wrong domain");
2508 }
2509
2510 struct uma_zone_count {
2511         const char      *name;
2512         int             count;
2513 };
2514
2515 static void
2516 zone_count(uma_zone_t zone, void *arg)
2517 {
2518         struct uma_zone_count *cnt;
2519
2520         cnt = arg;
2521         /*
2522          * Some zones are rapidly created with identical names and
2523          * destroyed out of order.  This can lead to gaps in the count.
2524          * Use one greater than the maximum observed for this name.
2525          */
2526         if (strcmp(zone->uz_name, cnt->name) == 0)
2527                 cnt->count = MAX(cnt->count,
2528                     zone->uz_namecnt + 1);
2529 }
2530
2531 static void
2532 zone_update_caches(uma_zone_t zone)
2533 {
2534         int i;
2535
2536         for (i = 0; i <= mp_maxid; i++) {
2537                 cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size);
2538                 cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags);
2539         }
2540 }
2541
2542 /*
2543  * Zone header ctor.  This initializes all fields, locks, etc.
2544  *
2545  * Arguments/Returns follow uma_ctor specifications
2546  *      udata  Actually uma_zctor_args
2547  */
2548 static int
2549 zone_ctor(void *mem, int size, void *udata, int flags)
2550 {
2551         struct uma_zone_count cnt;
2552         struct uma_zctor_args *arg = udata;
2553         uma_zone_domain_t zdom;
2554         uma_zone_t zone = mem;
2555         uma_zone_t z;
2556         uma_keg_t keg;
2557         int i;
2558
2559         bzero(zone, size);
2560         zone->uz_name = arg->name;
2561         zone->uz_ctor = arg->ctor;
2562         zone->uz_dtor = arg->dtor;
2563         zone->uz_init = NULL;
2564         zone->uz_fini = NULL;
2565         zone->uz_sleeps = 0;
2566         zone->uz_bucket_size = 0;
2567         zone->uz_bucket_size_min = 0;
2568         zone->uz_bucket_size_max = BUCKET_MAX;
2569         zone->uz_flags = (arg->flags & UMA_ZONE_SMR);
2570         zone->uz_warning = NULL;
2571         /* The domain structures follow the cpu structures. */
2572         zone->uz_bucket_max = ULONG_MAX;
2573         timevalclear(&zone->uz_ratecheck);
2574
2575         /* Count the number of duplicate names. */
2576         cnt.name = arg->name;
2577         cnt.count = 0;
2578         zone_foreach(zone_count, &cnt);
2579         zone->uz_namecnt = cnt.count;
2580         ZONE_CROSS_LOCK_INIT(zone);
2581
2582         for (i = 0; i < vm_ndomains; i++) {
2583                 zdom = ZDOM_GET(zone, i);
2584                 ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS));
2585                 STAILQ_INIT(&zdom->uzd_buckets);
2586         }
2587
2588 #ifdef INVARIANTS
2589         if (arg->uminit == trash_init && arg->fini == trash_fini)
2590                 zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR;
2591 #endif
2592
2593         /*
2594          * This is a pure cache zone, no kegs.
2595          */
2596         if (arg->import) {
2597                 KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0,
2598                     ("zone_ctor: Import specified for non-cache zone."));
2599                 zone->uz_flags = arg->flags;
2600                 zone->uz_size = arg->size;
2601                 zone->uz_import = arg->import;
2602                 zone->uz_release = arg->release;
2603                 zone->uz_arg = arg->arg;
2604 #ifdef NUMA
2605                 /*
2606                  * Cache zones are round-robin unless a policy is
2607                  * specified because they may have incompatible
2608                  * constraints.
2609                  */
2610                 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0)
2611                         zone->uz_flags |= UMA_ZONE_ROUNDROBIN;
2612 #endif
2613                 rw_wlock(&uma_rwlock);
2614                 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
2615                 rw_wunlock(&uma_rwlock);
2616                 goto out;
2617         }
2618
2619         /*
2620          * Use the regular zone/keg/slab allocator.
2621          */
2622         zone->uz_import = zone_import;
2623         zone->uz_release = zone_release;
2624         zone->uz_arg = zone;
2625         keg = arg->keg;
2626
2627         if (arg->flags & UMA_ZONE_SECONDARY) {
2628                 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
2629                     ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
2630                 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
2631                 zone->uz_init = arg->uminit;
2632                 zone->uz_fini = arg->fini;
2633                 zone->uz_flags |= UMA_ZONE_SECONDARY;
2634                 rw_wlock(&uma_rwlock);
2635                 ZONE_LOCK(zone);
2636                 LIST_FOREACH(z, &keg->uk_zones, uz_link) {
2637                         if (LIST_NEXT(z, uz_link) == NULL) {
2638                                 LIST_INSERT_AFTER(z, zone, uz_link);
2639                                 break;
2640                         }
2641                 }
2642                 ZONE_UNLOCK(zone);
2643                 rw_wunlock(&uma_rwlock);
2644         } else if (keg == NULL) {
2645                 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
2646                     arg->align, arg->flags)) == NULL)
2647                         return (ENOMEM);
2648         } else {
2649                 struct uma_kctor_args karg;
2650                 int error;
2651
2652                 /* We should only be here from uma_startup() */
2653                 karg.size = arg->size;
2654                 karg.uminit = arg->uminit;
2655                 karg.fini = arg->fini;
2656                 karg.align = arg->align;
2657                 karg.flags = (arg->flags & ~UMA_ZONE_SMR);
2658                 karg.zone = zone;
2659                 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
2660                     flags);
2661                 if (error)
2662                         return (error);
2663         }
2664
2665         /* Inherit properties from the keg. */
2666         zone->uz_keg = keg;
2667         zone->uz_size = keg->uk_size;
2668         zone->uz_flags |= (keg->uk_flags &
2669             (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
2670
2671 out:
2672         if (__predict_true(booted >= BOOT_RUNNING)) {
2673                 zone_alloc_counters(zone, NULL);
2674                 zone_alloc_sysctl(zone, NULL);
2675         } else {
2676                 zone->uz_allocs = EARLY_COUNTER;
2677                 zone->uz_frees = EARLY_COUNTER;
2678                 zone->uz_fails = EARLY_COUNTER;
2679         }
2680
2681         /* Caller requests a private SMR context. */
2682         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
2683                 zone->uz_smr = smr_create(zone->uz_name, 0, 0);
2684
2685         KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
2686             (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
2687             ("Invalid zone flag combination"));
2688         if (arg->flags & UMA_ZFLAG_INTERNAL)
2689                 zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
2690         if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
2691                 zone->uz_bucket_size = BUCKET_MAX;
2692         else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0)
2693                 zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN;
2694         else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
2695                 zone->uz_bucket_size = 0;
2696         else
2697                 zone->uz_bucket_size = bucket_select(zone->uz_size);
2698         zone->uz_bucket_size_min = zone->uz_bucket_size;
2699         if (zone->uz_dtor != NULL || zone->uz_ctor != NULL)
2700                 zone->uz_flags |= UMA_ZFLAG_CTORDTOR;
2701         zone_update_caches(zone);
2702
2703         return (0);
2704 }
2705
2706 /*
2707  * Keg header dtor.  This frees all data, destroys locks, frees the hash
2708  * table and removes the keg from the global list.
2709  *
2710  * Arguments/Returns follow uma_dtor specifications
2711  *      udata  unused
2712  */
2713 static void
2714 keg_dtor(void *arg, int size, void *udata)
2715 {
2716         uma_keg_t keg;
2717         uint32_t free, pages;
2718         int i;
2719
2720         keg = (uma_keg_t)arg;
2721         free = pages = 0;
2722         for (i = 0; i < vm_ndomains; i++) {
2723                 free += keg->uk_domain[i].ud_free_items;
2724                 pages += keg->uk_domain[i].ud_pages;
2725                 KEG_LOCK_FINI(keg, i);
2726         }
2727         if (pages != 0)
2728                 printf("Freed UMA keg (%s) was not empty (%u items). "
2729                     " Lost %u pages of memory.\n",
2730                     keg->uk_name ? keg->uk_name : "",
2731                     pages / keg->uk_ppera * keg->uk_ipers - free, pages);
2732
2733         hash_free(&keg->uk_hash);
2734 }
2735
2736 /*
2737  * Zone header dtor.
2738  *
2739  * Arguments/Returns follow uma_dtor specifications
2740  *      udata  unused
2741  */
2742 static void
2743 zone_dtor(void *arg, int size, void *udata)
2744 {
2745         uma_zone_t zone;
2746         uma_keg_t keg;
2747         int i;
2748
2749         zone = (uma_zone_t)arg;
2750
2751         sysctl_remove_oid(zone->uz_oid, 1, 1);
2752
2753         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
2754                 cache_drain(zone);
2755
2756         rw_wlock(&uma_rwlock);
2757         LIST_REMOVE(zone, uz_link);
2758         rw_wunlock(&uma_rwlock);
2759         zone_reclaim(zone, M_WAITOK, true);
2760
2761         /*
2762          * We only destroy kegs from non secondary/non cache zones.
2763          */
2764         if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
2765                 keg = zone->uz_keg;
2766                 rw_wlock(&uma_rwlock);
2767                 LIST_REMOVE(keg, uk_link);
2768                 rw_wunlock(&uma_rwlock);
2769                 zone_free_item(kegs, keg, NULL, SKIP_NONE);
2770         }
2771         counter_u64_free(zone->uz_allocs);
2772         counter_u64_free(zone->uz_frees);
2773         counter_u64_free(zone->uz_fails);
2774         counter_u64_free(zone->uz_xdomain);
2775         free(zone->uz_ctlname, M_UMA);
2776         for (i = 0; i < vm_ndomains; i++)
2777                 ZDOM_LOCK_FINI(ZDOM_GET(zone, i));
2778         ZONE_CROSS_LOCK_FINI(zone);
2779 }
2780
2781 static void
2782 zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg)
2783 {
2784         uma_keg_t keg;
2785         uma_zone_t zone;
2786
2787         LIST_FOREACH(keg, &uma_kegs, uk_link) {
2788                 LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2789                         zfunc(zone, arg);
2790         }
2791         LIST_FOREACH(zone, &uma_cachezones, uz_link)
2792                 zfunc(zone, arg);
2793 }
2794
2795 /*
2796  * Traverses every zone in the system and calls a callback
2797  *
2798  * Arguments:
2799  *      zfunc  A pointer to a function which accepts a zone
2800  *              as an argument.
2801  *
2802  * Returns:
2803  *      Nothing
2804  */
2805 static void
2806 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
2807 {
2808
2809         rw_rlock(&uma_rwlock);
2810         zone_foreach_unlocked(zfunc, arg);
2811         rw_runlock(&uma_rwlock);
2812 }
2813
2814 /*
2815  * Initialize the kernel memory allocator.  This is done after pages can be
2816  * allocated but before general KVA is available.
2817  */
2818 void
2819 uma_startup1(vm_offset_t virtual_avail)
2820 {
2821         struct uma_zctor_args args;
2822         size_t ksize, zsize, size;
2823         uma_keg_t masterkeg;
2824         uintptr_t m;
2825         uint8_t pflag;
2826
2827         bootstart = bootmem = virtual_avail;
2828
2829         rw_init(&uma_rwlock, "UMA lock");
2830         sx_init(&uma_reclaim_lock, "umareclaim");
2831
2832         ksize = sizeof(struct uma_keg) +
2833             (sizeof(struct uma_domain) * vm_ndomains);
2834         ksize = roundup(ksize, UMA_SUPER_ALIGN);
2835         zsize = sizeof(struct uma_zone) +
2836             (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2837             (sizeof(struct uma_zone_domain) * vm_ndomains);
2838         zsize = roundup(zsize, UMA_SUPER_ALIGN);
2839
2840         /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */
2841         size = (zsize * 2) + ksize;
2842         m = (uintptr_t)startup_alloc(NULL, size, 0, &pflag, M_NOWAIT | M_ZERO);
2843         zones = (uma_zone_t)m;
2844         m += zsize;
2845         kegs = (uma_zone_t)m;
2846         m += zsize;
2847         masterkeg = (uma_keg_t)m;
2848
2849         /* "manually" create the initial zone */
2850         memset(&args, 0, sizeof(args));
2851         args.name = "UMA Kegs";
2852         args.size = ksize;
2853         args.ctor = keg_ctor;
2854         args.dtor = keg_dtor;
2855         args.uminit = zero_init;
2856         args.fini = NULL;
2857         args.keg = masterkeg;
2858         args.align = UMA_SUPER_ALIGN - 1;
2859         args.flags = UMA_ZFLAG_INTERNAL;
2860         zone_ctor(kegs, zsize, &args, M_WAITOK);
2861
2862         args.name = "UMA Zones";
2863         args.size = zsize;
2864         args.ctor = zone_ctor;
2865         args.dtor = zone_dtor;
2866         args.uminit = zero_init;
2867         args.fini = NULL;
2868         args.keg = NULL;
2869         args.align = UMA_SUPER_ALIGN - 1;
2870         args.flags = UMA_ZFLAG_INTERNAL;
2871         zone_ctor(zones, zsize, &args, M_WAITOK);
2872
2873         /* Now make zones for slab headers */
2874         slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE,
2875             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2876         slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE,
2877             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2878
2879         hashzone = uma_zcreate("UMA Hash",
2880             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2881             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2882
2883         bucket_init();
2884         smr_init();
2885 }
2886
2887 #ifndef UMA_MD_SMALL_ALLOC
2888 extern void vm_radix_reserve_kva(void);
2889 #endif
2890
2891 /*
2892  * Advertise the availability of normal kva allocations and switch to
2893  * the default back-end allocator.  Marks the KVA we consumed on startup
2894  * as used in the map.
2895  */
2896 void
2897 uma_startup2(void)
2898 {
2899
2900         if (bootstart != bootmem) {
2901                 vm_map_lock(kernel_map);
2902                 (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem,
2903                     VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
2904                 vm_map_unlock(kernel_map);
2905         }
2906
2907 #ifndef UMA_MD_SMALL_ALLOC
2908         /* Set up radix zone to use noobj_alloc. */
2909         vm_radix_reserve_kva();
2910 #endif
2911
2912         booted = BOOT_KVA;
2913         zone_foreach_unlocked(zone_kva_available, NULL);
2914         bucket_enable();
2915 }
2916
2917 /*
2918  * Finish our initialization steps.
2919  */
2920 static void
2921 uma_startup3(void)
2922 {
2923
2924 #ifdef INVARIANTS
2925         TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2926         uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2927         uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2928 #endif
2929         zone_foreach_unlocked(zone_alloc_counters, NULL);
2930         zone_foreach_unlocked(zone_alloc_sysctl, NULL);
2931         callout_init(&uma_callout, 1);
2932         callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2933         booted = BOOT_RUNNING;
2934
2935         EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
2936             EVENTHANDLER_PRI_FIRST);
2937 }
2938
2939 static void
2940 uma_shutdown(void)
2941 {
2942
2943         booted = BOOT_SHUTDOWN;
2944 }
2945
2946 static uma_keg_t
2947 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2948                 int align, uint32_t flags)
2949 {
2950         struct uma_kctor_args args;
2951
2952         args.size = size;
2953         args.uminit = uminit;
2954         args.fini = fini;
2955         args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2956         args.flags = flags;
2957         args.zone = zone;
2958         return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2959 }
2960
2961 /* Public functions */
2962 /* See uma.h */
2963 void
2964 uma_set_align(int align)
2965 {
2966
2967         if (align != UMA_ALIGN_CACHE)
2968                 uma_align_cache = align;
2969 }
2970
2971 /* See uma.h */
2972 uma_zone_t
2973 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2974                 uma_init uminit, uma_fini fini, int align, uint32_t flags)
2975
2976 {
2977         struct uma_zctor_args args;
2978         uma_zone_t res;
2979
2980         KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2981             align, name));
2982
2983         /* This stuff is essential for the zone ctor */
2984         memset(&args, 0, sizeof(args));
2985         args.name = name;
2986         args.size = size;
2987         args.ctor = ctor;
2988         args.dtor = dtor;
2989         args.uminit = uminit;
2990         args.fini = fini;
2991 #ifdef  INVARIANTS
2992         /*
2993          * Inject procedures which check for memory use after free if we are
2994          * allowed to scramble the memory while it is not allocated.  This
2995          * requires that: UMA is actually able to access the memory, no init
2996          * or fini procedures, no dependency on the initial value of the
2997          * memory, and no (legitimate) use of the memory after free.  Note,
2998          * the ctor and dtor do not need to be empty.
2999          */
3000         if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH |
3001             UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) {
3002                 args.uminit = trash_init;
3003                 args.fini = trash_fini;
3004         }
3005 #endif
3006         args.align = align;
3007         args.flags = flags;
3008         args.keg = NULL;
3009
3010         sx_slock(&uma_reclaim_lock);
3011         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
3012         sx_sunlock(&uma_reclaim_lock);
3013
3014         return (res);
3015 }
3016
3017 /* See uma.h */
3018 uma_zone_t
3019 uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
3020     uma_init zinit, uma_fini zfini, uma_zone_t master)
3021 {
3022         struct uma_zctor_args args;
3023         uma_keg_t keg;
3024         uma_zone_t res;
3025
3026         keg = master->uz_keg;
3027         memset(&args, 0, sizeof(args));
3028         args.name = name;
3029         args.size = keg->uk_size;
3030         args.ctor = ctor;
3031         args.dtor = dtor;
3032         args.uminit = zinit;
3033         args.fini = zfini;
3034         args.align = keg->uk_align;
3035         args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
3036         args.keg = keg;
3037
3038         sx_slock(&uma_reclaim_lock);
3039         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
3040         sx_sunlock(&uma_reclaim_lock);
3041
3042         return (res);
3043 }
3044
3045 /* See uma.h */
3046 uma_zone_t
3047 uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
3048     uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
3049     void *arg, int flags)
3050 {
3051         struct uma_zctor_args args;
3052
3053         memset(&args, 0, sizeof(args));
3054         args.name = name;
3055         args.size = size;
3056         args.ctor = ctor;
3057         args.dtor = dtor;
3058         args.uminit = zinit;
3059         args.fini = zfini;
3060         args.import = zimport;
3061         args.release = zrelease;
3062         args.arg = arg;
3063         args.align = 0;
3064         args.flags = flags | UMA_ZFLAG_CACHE;
3065
3066         return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
3067 }
3068
3069 /* See uma.h */
3070 void
3071 uma_zdestroy(uma_zone_t zone)
3072 {
3073
3074         /*
3075          * Large slabs are expensive to reclaim, so don't bother doing
3076          * unnecessary work if we're shutting down.
3077          */
3078         if (booted == BOOT_SHUTDOWN &&
3079             zone->uz_fini == NULL && zone->uz_release == zone_release)
3080                 return;
3081         sx_slock(&uma_reclaim_lock);
3082         zone_free_item(zones, zone, NULL, SKIP_NONE);
3083         sx_sunlock(&uma_reclaim_lock);
3084 }
3085
3086 void
3087 uma_zwait(uma_zone_t zone)
3088 {
3089
3090         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
3091                 uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK));
3092         else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0)
3093                 uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK));
3094         else
3095                 uma_zfree(zone, uma_zalloc(zone, M_WAITOK));
3096 }
3097
3098 void *
3099 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
3100 {
3101         void *item, *pcpu_item;
3102 #ifdef SMP
3103         int i;
3104
3105         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
3106 #endif
3107         item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
3108         if (item == NULL)
3109                 return (NULL);
3110         pcpu_item = zpcpu_base_to_offset(item);
3111         if (flags & M_ZERO) {
3112 #ifdef SMP
3113                 for (i = 0; i <= mp_maxid; i++)
3114                         bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size);
3115 #else
3116                 bzero(item, zone->uz_size);
3117 #endif
3118         }
3119         return (pcpu_item);
3120 }
3121
3122 /*
3123  * A stub while both regular and pcpu cases are identical.
3124  */
3125 void
3126 uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata)
3127 {
3128         void *item;
3129
3130 #ifdef SMP
3131         MPASS(zone->uz_flags & UMA_ZONE_PCPU);
3132 #endif
3133         item = zpcpu_offset_to_base(pcpu_item);
3134         uma_zfree_arg(zone, item, udata);
3135 }
3136
3137 static inline void *
3138 item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags,
3139     void *item)
3140 {
3141 #ifdef INVARIANTS
3142         bool skipdbg;
3143
3144         skipdbg = uma_dbg_zskip(zone, item);
3145         if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
3146             zone->uz_ctor != trash_ctor)
3147                 trash_ctor(item, size, udata, flags);
3148 #endif
3149         /* Check flags before loading ctor pointer. */
3150         if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) &&
3151             __predict_false(zone->uz_ctor != NULL) &&
3152             zone->uz_ctor(item, size, udata, flags) != 0) {
3153                 counter_u64_add(zone->uz_fails, 1);
3154                 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
3155                 return (NULL);
3156         }
3157 #ifdef INVARIANTS
3158         if (!skipdbg)
3159                 uma_dbg_alloc(zone, NULL, item);
3160 #endif
3161         if (__predict_false(flags & M_ZERO))
3162                 return (memset(item, 0, size));
3163
3164         return (item);
3165 }
3166
3167 static inline void
3168 item_dtor(uma_zone_t zone, void *item, int size, void *udata,
3169     enum zfreeskip skip)
3170 {
3171 #ifdef INVARIANTS
3172         bool skipdbg;
3173
3174         skipdbg = uma_dbg_zskip(zone, item);
3175         if (skip == SKIP_NONE && !skipdbg) {
3176                 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
3177                         uma_dbg_free(zone, udata, item);
3178                 else
3179                         uma_dbg_free(zone, NULL, item);
3180         }
3181 #endif
3182         if (__predict_true(skip < SKIP_DTOR)) {
3183                 if (zone->uz_dtor != NULL)
3184                         zone->uz_dtor(item, size, udata);
3185 #ifdef INVARIANTS
3186                 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
3187                     zone->uz_dtor != trash_dtor)
3188                         trash_dtor(item, size, udata);
3189 #endif
3190         }
3191 }
3192
3193 #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS)
3194 #define UMA_ZALLOC_DEBUG
3195 static int
3196 uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags)
3197 {
3198         int error;
3199
3200         error = 0;
3201 #ifdef WITNESS
3202         if (flags & M_WAITOK) {
3203                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
3204                     "uma_zalloc_debug: zone \"%s\"", zone->uz_name);
3205         }
3206 #endif
3207
3208 #ifdef INVARIANTS
3209         KASSERT((flags & M_EXEC) == 0,
3210             ("uma_zalloc_debug: called with M_EXEC"));
3211         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3212             ("uma_zalloc_debug: called within spinlock or critical section"));
3213         KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0,
3214             ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO"));
3215 #endif
3216
3217 #ifdef DEBUG_MEMGUARD
3218         if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && memguard_cmp_zone(zone)) {
3219                 void *item;
3220                 item = memguard_alloc(zone->uz_size, flags);
3221                 if (item != NULL) {
3222                         error = EJUSTRETURN;
3223                         if (zone->uz_init != NULL &&
3224                             zone->uz_init(item, zone->uz_size, flags) != 0) {
3225                                 *itemp = NULL;
3226                                 return (error);
3227                         }
3228                         if (zone->uz_ctor != NULL &&
3229                             zone->uz_ctor(item, zone->uz_size, udata,
3230                             flags) != 0) {
3231                                 counter_u64_add(zone->uz_fails, 1);
3232                                 zone->uz_fini(item, zone->uz_size);
3233                                 *itemp = NULL;
3234                                 return (error);
3235                         }
3236                         *itemp = item;
3237                         return (error);
3238                 }
3239                 /* This is unfortunate but should not be fatal. */
3240         }
3241 #endif
3242         return (error);
3243 }
3244
3245 static int
3246 uma_zfree_debug(uma_zone_t zone, void *item, void *udata)
3247 {
3248         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3249             ("uma_zfree_debug: called with spinlock or critical section held"));
3250
3251 #ifdef DEBUG_MEMGUARD
3252         if ((zone->uz_flags & UMA_ZONE_SMR) == 0 && is_memguard_addr(item)) {
3253                 if (zone->uz_dtor != NULL)
3254                         zone->uz_dtor(item, zone->uz_size, udata);
3255                 if (zone->uz_fini != NULL)
3256                         zone->uz_fini(item, zone->uz_size);
3257                 memguard_free(item);
3258                 return (EJUSTRETURN);
3259         }
3260 #endif
3261         return (0);
3262 }
3263 #endif
3264
3265 static inline void *
3266 cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket,
3267     void *udata, int flags)
3268 {
3269         void *item;
3270         int size, uz_flags;
3271
3272         item = cache_bucket_pop(cache, bucket);
3273         size = cache_uz_size(cache);
3274         uz_flags = cache_uz_flags(cache);
3275         critical_exit();
3276         return (item_ctor(zone, uz_flags, size, udata, flags, item));
3277 }
3278
3279 static __noinline void *
3280 cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
3281 {
3282         uma_cache_bucket_t bucket;
3283         int domain;
3284
3285         while (cache_alloc(zone, cache, udata, flags)) {
3286                 cache = &zone->uz_cpu[curcpu];
3287                 bucket = &cache->uc_allocbucket;
3288                 if (__predict_false(bucket->ucb_cnt == 0))
3289                         continue;
3290                 return (cache_alloc_item(zone, cache, bucket, udata, flags));
3291         }
3292         critical_exit();
3293
3294         /*
3295          * We can not get a bucket so try to return a single item.
3296          */
3297         if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH)
3298                 domain = PCPU_GET(domain);
3299         else
3300                 domain = UMA_ANYDOMAIN;
3301         return (zone_alloc_item(zone, udata, domain, flags));
3302 }
3303
3304 /* See uma.h */
3305 void *
3306 uma_zalloc_smr(uma_zone_t zone, int flags)
3307 {
3308         uma_cache_bucket_t bucket;
3309         uma_cache_t cache;
3310
3311 #ifdef UMA_ZALLOC_DEBUG
3312         void *item;
3313
3314         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
3315             ("uma_zalloc_arg: called with non-SMR zone.\n"));
3316         if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN)
3317                 return (item);
3318 #endif
3319
3320         critical_enter();
3321         cache = &zone->uz_cpu[curcpu];
3322         bucket = &cache->uc_allocbucket;
3323         if (__predict_false(bucket->ucb_cnt == 0))
3324                 return (cache_alloc_retry(zone, cache, NULL, flags));
3325         return (cache_alloc_item(zone, cache, bucket, NULL, flags));
3326 }
3327
3328 /* See uma.h */
3329 void *
3330 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
3331 {
3332         uma_cache_bucket_t bucket;
3333         uma_cache_t cache;
3334
3335         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3336         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3337
3338         /* This is the fast path allocation */
3339         CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name,
3340             zone, flags);
3341
3342 #ifdef UMA_ZALLOC_DEBUG
3343         void *item;
3344
3345         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
3346             ("uma_zalloc_arg: called with SMR zone.\n"));
3347         if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN)
3348                 return (item);
3349 #endif
3350
3351         /*
3352          * If possible, allocate from the per-CPU cache.  There are two
3353          * requirements for safe access to the per-CPU cache: (1) the thread
3354          * accessing the cache must not be preempted or yield during access,
3355          * and (2) the thread must not migrate CPUs without switching which
3356          * cache it accesses.  We rely on a critical section to prevent
3357          * preemption and migration.  We release the critical section in
3358          * order to acquire the zone mutex if we are unable to allocate from
3359          * the current cache; when we re-acquire the critical section, we
3360          * must detect and handle migration if it has occurred.
3361          */
3362         critical_enter();
3363         cache = &zone->uz_cpu[curcpu];
3364         bucket = &cache->uc_allocbucket;
3365         if (__predict_false(bucket->ucb_cnt == 0))
3366                 return (cache_alloc_retry(zone, cache, udata, flags));
3367         return (cache_alloc_item(zone, cache, bucket, udata, flags));
3368 }
3369
3370 /*
3371  * Replenish an alloc bucket and possibly restore an old one.  Called in
3372  * a critical section.  Returns in a critical section.
3373  *
3374  * A false return value indicates an allocation failure.
3375  * A true return value indicates success and the caller should retry.
3376  */
3377 static __noinline bool
3378 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
3379 {
3380         uma_bucket_t bucket;
3381         int domain;
3382         bool new;
3383
3384         CRITICAL_ASSERT(curthread);
3385
3386         /*
3387          * If we have run out of items in our alloc bucket see
3388          * if we can switch with the free bucket.
3389          *
3390          * SMR Zones can't re-use the free bucket until the sequence has
3391          * expired.
3392          */
3393         if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 &&
3394             cache->uc_freebucket.ucb_cnt != 0) {
3395                 cache_bucket_swap(&cache->uc_freebucket,
3396                     &cache->uc_allocbucket);
3397                 return (true);
3398         }
3399
3400         /*
3401          * Discard any empty allocation bucket while we hold no locks.
3402          */
3403         bucket = cache_bucket_unload_alloc(cache);
3404         critical_exit();
3405
3406         if (bucket != NULL) {
3407                 KASSERT(bucket->ub_cnt == 0,
3408                     ("cache_alloc: Entered with non-empty alloc bucket."));
3409                 bucket_free(zone, bucket, udata);
3410         }
3411
3412         /* Short-circuit for zones without buckets and low memory. */
3413         if (zone->uz_bucket_size == 0 || bucketdisable) {
3414                 critical_enter();
3415                 return (false);
3416         }
3417
3418         /*
3419          * Attempt to retrieve the item from the per-CPU cache has failed, so
3420          * we must go back to the zone.  This requires the zdom lock, so we
3421          * must drop the critical section, then re-acquire it when we go back
3422          * to the cache.  Since the critical section is released, we may be
3423          * preempted or migrate.  As such, make sure not to maintain any
3424          * thread-local state specific to the cache from prior to releasing
3425          * the critical section.
3426          */
3427         domain = PCPU_GET(domain);
3428         if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0)
3429                 domain = zone_domain_highest(zone, domain);
3430         bucket = cache_fetch_bucket(zone, cache, domain);
3431         if (bucket == NULL) {
3432                 bucket = zone_alloc_bucket(zone, udata, domain, flags);
3433                 new = true;
3434         } else
3435                 new = false;
3436
3437         CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
3438             zone->uz_name, zone, bucket);
3439         if (bucket == NULL) {
3440                 critical_enter();
3441                 return (false);
3442         }
3443
3444         /*
3445          * See if we lost the race or were migrated.  Cache the
3446          * initialized bucket to make this less likely or claim
3447          * the memory directly.
3448          */
3449         critical_enter();
3450         cache = &zone->uz_cpu[curcpu];
3451         if (cache->uc_allocbucket.ucb_bucket == NULL &&
3452             ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 ||
3453             domain == PCPU_GET(domain))) {
3454                 if (new)
3455                         atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax,
3456                             bucket->ub_cnt);
3457                 cache_bucket_load_alloc(cache, bucket);
3458                 return (true);
3459         }
3460
3461         /*
3462          * We lost the race, release this bucket and start over.
3463          */
3464         critical_exit();
3465         zone_put_bucket(zone, domain, bucket, udata, false);
3466         critical_enter();
3467
3468         return (true);
3469 }
3470
3471 void *
3472 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
3473 {
3474
3475         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3476         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3477
3478         /* This is the fast path allocation */
3479         CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d",
3480             zone->uz_name, zone, domain, flags);
3481
3482         if (flags & M_WAITOK) {
3483                 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
3484                     "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
3485         }
3486         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3487             ("uma_zalloc_domain: called with spinlock or critical section held"));
3488
3489         return (zone_alloc_item(zone, udata, domain, flags));
3490 }
3491
3492 /*
3493  * Find a slab with some space.  Prefer slabs that are partially used over those
3494  * that are totally full.  This helps to reduce fragmentation.
3495  *
3496  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
3497  * only 'domain'.
3498  */
3499 static uma_slab_t
3500 keg_first_slab(uma_keg_t keg, int domain, bool rr)
3501 {
3502         uma_domain_t dom;
3503         uma_slab_t slab;
3504         int start;
3505
3506         KASSERT(domain >= 0 && domain < vm_ndomains,
3507             ("keg_first_slab: domain %d out of range", domain));
3508         KEG_LOCK_ASSERT(keg, domain);
3509
3510         slab = NULL;
3511         start = domain;
3512         do {
3513                 dom = &keg->uk_domain[domain];
3514                 if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL)
3515                         return (slab);
3516                 if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) {
3517                         LIST_REMOVE(slab, us_link);
3518                         dom->ud_free_slabs--;
3519                         LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3520                         return (slab);
3521                 }
3522                 if (rr)
3523                         domain = (domain + 1) % vm_ndomains;
3524         } while (domain != start);
3525
3526         return (NULL);
3527 }
3528
3529 /*
3530  * Fetch an existing slab from a free or partial list.  Returns with the
3531  * keg domain lock held if a slab was found or unlocked if not.
3532  */
3533 static uma_slab_t
3534 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
3535 {
3536         uma_slab_t slab;
3537         uint32_t reserve;
3538
3539         /* HASH has a single free list. */
3540         if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0)
3541                 domain = 0;
3542
3543         KEG_LOCK(keg, domain);
3544         reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
3545         if (keg->uk_domain[domain].ud_free_items <= reserve ||
3546             (slab = keg_first_slab(keg, domain, rr)) == NULL) {
3547                 KEG_UNLOCK(keg, domain);
3548                 return (NULL);
3549         }
3550         return (slab);
3551 }
3552
3553 static uma_slab_t
3554 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
3555 {
3556         struct vm_domainset_iter di;
3557         uma_slab_t slab;
3558         int aflags, domain;
3559         bool rr;
3560
3561 restart:
3562         /*
3563          * Use the keg's policy if upper layers haven't already specified a
3564          * domain (as happens with first-touch zones).
3565          *
3566          * To avoid races we run the iterator with the keg lock held, but that
3567          * means that we cannot allow the vm_domainset layer to sleep.  Thus,
3568          * clear M_WAITOK and handle low memory conditions locally.
3569          */
3570         rr = rdomain == UMA_ANYDOMAIN;
3571         if (rr) {
3572                 aflags = (flags & ~M_WAITOK) | M_NOWAIT;
3573                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3574                     &aflags);
3575         } else {
3576                 aflags = flags;
3577                 domain = rdomain;
3578         }
3579
3580         for (;;) {
3581                 slab = keg_fetch_free_slab(keg, domain, rr, flags);
3582                 if (slab != NULL)
3583                         return (slab);
3584
3585                 /*
3586                  * M_NOVM means don't ask at all!
3587                  */
3588                 if (flags & M_NOVM)
3589                         break;
3590
3591                 slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
3592                 if (slab != NULL)
3593                         return (slab);
3594                 if (!rr && (flags & M_WAITOK) == 0)
3595                         break;
3596                 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
3597                         if ((flags & M_WAITOK) != 0) {
3598                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3599                                 goto restart;
3600                         }
3601                         break;
3602                 }
3603         }
3604
3605         /*
3606          * We might not have been able to get a slab but another cpu
3607          * could have while we were unlocked.  Check again before we
3608          * fail.
3609          */
3610         if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL)
3611                 return (slab);
3612
3613         return (NULL);
3614 }
3615
3616 static void *
3617 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
3618 {
3619         uma_domain_t dom;
3620         void *item;
3621         int freei;
3622
3623         KEG_LOCK_ASSERT(keg, slab->us_domain);
3624
3625         dom = &keg->uk_domain[slab->us_domain];
3626         freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
3627         BIT_CLR(keg->uk_ipers, freei, &slab->us_free);
3628         item = slab_item(slab, keg, freei);
3629         slab->us_freecount--;
3630         dom->ud_free_items--;
3631
3632         /*
3633          * Move this slab to the full list.  It must be on the partial list, so
3634          * we do not need to update the free slab count.  In particular,
3635          * keg_fetch_slab() always returns slabs on the partial list.
3636          */
3637         if (slab->us_freecount == 0) {
3638                 LIST_REMOVE(slab, us_link);
3639                 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
3640         }
3641
3642         return (item);
3643 }
3644
3645 static int
3646 zone_import(void *arg, void **bucket, int max, int domain, int flags)
3647 {
3648         uma_domain_t dom;
3649         uma_zone_t zone;
3650         uma_slab_t slab;
3651         uma_keg_t keg;
3652 #ifdef NUMA
3653         int stripe;
3654 #endif
3655         int i;
3656
3657         zone = arg;
3658         slab = NULL;
3659         keg = zone->uz_keg;
3660         /* Try to keep the buckets totally full */
3661         for (i = 0; i < max; ) {
3662                 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
3663                         break;
3664 #ifdef NUMA
3665                 stripe = howmany(max, vm_ndomains);
3666 #endif
3667                 dom = &keg->uk_domain[slab->us_domain];
3668                 while (slab->us_freecount && i < max) {
3669                         bucket[i++] = slab_alloc_item(keg, slab);
3670                         if (dom->ud_free_items <= keg->uk_reserve)
3671                                 break;
3672 #ifdef NUMA
3673                         /*
3674                          * If the zone is striped we pick a new slab for every
3675                          * N allocations.  Eliminating this conditional will
3676                          * instead pick a new domain for each bucket rather
3677                          * than stripe within each bucket.  The current option
3678                          * produces more fragmentation and requires more cpu
3679                          * time but yields better distribution.
3680                          */
3681                         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 &&
3682                             vm_ndomains > 1 && --stripe == 0)
3683                                 break;
3684 #endif
3685                 }
3686                 KEG_UNLOCK(keg, slab->us_domain);
3687                 /* Don't block if we allocated any successfully. */
3688                 flags &= ~M_WAITOK;
3689                 flags |= M_NOWAIT;
3690         }
3691
3692         return i;
3693 }
3694
3695 static int
3696 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
3697 {
3698         uint64_t old, new, total, max;
3699
3700         /*
3701          * The hard case.  We're going to sleep because there were existing
3702          * sleepers or because we ran out of items.  This routine enforces
3703          * fairness by keeping fifo order.
3704          *
3705          * First release our ill gotten gains and make some noise.
3706          */
3707         for (;;) {
3708                 zone_free_limit(zone, count);
3709                 zone_log_warning(zone);
3710                 zone_maxaction(zone);
3711                 if (flags & M_NOWAIT)
3712                         return (0);
3713
3714                 /*
3715                  * We need to allocate an item or set ourself as a sleeper
3716                  * while the sleepq lock is held to avoid wakeup races.  This
3717                  * is essentially a home rolled semaphore.
3718                  */
3719                 sleepq_lock(&zone->uz_max_items);
3720                 old = zone->uz_items;
3721                 do {
3722                         MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
3723                         /* Cache the max since we will evaluate twice. */
3724                         max = zone->uz_max_items;
3725                         if (UZ_ITEMS_SLEEPERS(old) != 0 ||
3726                             UZ_ITEMS_COUNT(old) >= max)
3727                                 new = old + UZ_ITEMS_SLEEPER;
3728                         else
3729                                 new = old + MIN(count, max - old);
3730                 } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
3731
3732                 /* We may have successfully allocated under the sleepq lock. */
3733                 if (UZ_ITEMS_SLEEPERS(new) == 0) {
3734                         sleepq_release(&zone->uz_max_items);
3735                         return (new - old);
3736                 }
3737
3738                 /*
3739                  * This is in a different cacheline from uz_items so that we
3740                  * don't constantly invalidate the fastpath cacheline when we
3741                  * adjust item counts.  This could be limited to toggling on
3742                  * transitions.
3743                  */
3744                 atomic_add_32(&zone->uz_sleepers, 1);
3745                 atomic_add_64(&zone->uz_sleeps, 1);
3746
3747                 /*
3748                  * We have added ourselves as a sleeper.  The sleepq lock
3749                  * protects us from wakeup races.  Sleep now and then retry.
3750                  */
3751                 sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
3752                 sleepq_wait(&zone->uz_max_items, PVM);
3753
3754                 /*
3755                  * After wakeup, remove ourselves as a sleeper and try
3756                  * again.  We no longer have the sleepq lock for protection.
3757                  *
3758                  * Subract ourselves as a sleeper while attempting to add
3759                  * our count.
3760                  */
3761                 atomic_subtract_32(&zone->uz_sleepers, 1);
3762                 old = atomic_fetchadd_64(&zone->uz_items,
3763                     -(UZ_ITEMS_SLEEPER - count));
3764                 /* We're no longer a sleeper. */
3765                 old -= UZ_ITEMS_SLEEPER;
3766
3767                 /*
3768                  * If we're still at the limit, restart.  Notably do not
3769                  * block on other sleepers.  Cache the max value to protect
3770                  * against changes via sysctl.
3771                  */
3772                 total = UZ_ITEMS_COUNT(old);
3773                 max = zone->uz_max_items;
3774                 if (total >= max)
3775                         continue;
3776                 /* Truncate if necessary, otherwise wake other sleepers. */
3777                 if (total + count > max) {
3778                         zone_free_limit(zone, total + count - max);
3779                         count = max - total;
3780                 } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
3781                         wakeup_one(&zone->uz_max_items);
3782
3783                 return (count);
3784         }
3785 }
3786
3787 /*
3788  * Allocate 'count' items from our max_items limit.  Returns the number
3789  * available.  If M_NOWAIT is not specified it will sleep until at least
3790  * one item can be allocated.
3791  */
3792 static int
3793 zone_alloc_limit(uma_zone_t zone, int count, int flags)
3794 {
3795         uint64_t old;
3796         uint64_t max;
3797
3798         max = zone->uz_max_items;
3799         MPASS(max > 0);
3800
3801         /*
3802          * We expect normal allocations to succeed with a simple
3803          * fetchadd.
3804          */
3805         old = atomic_fetchadd_64(&zone->uz_items, count);
3806         if (__predict_true(old + count <= max))
3807                 return (count);
3808
3809         /*
3810          * If we had some items and no sleepers just return the
3811          * truncated value.  We have to release the excess space
3812          * though because that may wake sleepers who weren't woken
3813          * because we were temporarily over the limit.
3814          */
3815         if (old < max) {
3816                 zone_free_limit(zone, (old + count) - max);
3817                 return (max - old);
3818         }
3819         return (zone_alloc_limit_hard(zone, count, flags));
3820 }
3821
3822 /*
3823  * Free a number of items back to the limit.
3824  */
3825 static void
3826 zone_free_limit(uma_zone_t zone, int count)
3827 {
3828         uint64_t old;
3829
3830         MPASS(count > 0);
3831
3832         /*
3833          * In the common case we either have no sleepers or
3834          * are still over the limit and can just return.
3835          */
3836         old = atomic_fetchadd_64(&zone->uz_items, -count);
3837         if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
3838            UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
3839                 return;
3840
3841         /*
3842          * Moderate the rate of wakeups.  Sleepers will continue
3843          * to generate wakeups if necessary.
3844          */
3845         wakeup_one(&zone->uz_max_items);
3846 }
3847
3848 static uma_bucket_t
3849 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3850 {
3851         uma_bucket_t bucket;
3852         int maxbucket, cnt;
3853
3854         CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
3855             zone, domain);
3856
3857         /* Avoid allocs targeting empty domains. */
3858         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3859                 domain = UMA_ANYDOMAIN;
3860         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
3861                 domain = UMA_ANYDOMAIN;
3862
3863         if (zone->uz_max_items > 0)
3864                 maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
3865                     M_NOWAIT);
3866         else
3867                 maxbucket = zone->uz_bucket_size;
3868         if (maxbucket == 0)
3869                 return (false);
3870
3871         /* Don't wait for buckets, preserve caller's NOVM setting. */
3872         bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3873         if (bucket == NULL) {
3874                 cnt = 0;
3875                 goto out;
3876         }
3877
3878         bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3879             MIN(maxbucket, bucket->ub_entries), domain, flags);
3880
3881         /*
3882          * Initialize the memory if necessary.
3883          */
3884         if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3885                 int i;
3886
3887                 for (i = 0; i < bucket->ub_cnt; i++)
3888                         if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3889                             flags) != 0)
3890                                 break;
3891                 /*
3892                  * If we couldn't initialize the whole bucket, put the
3893                  * rest back onto the freelist.
3894                  */
3895                 if (i != bucket->ub_cnt) {
3896                         zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3897                             bucket->ub_cnt - i);
3898 #ifdef INVARIANTS
3899                         bzero(&bucket->ub_bucket[i],
3900                             sizeof(void *) * (bucket->ub_cnt - i));
3901 #endif
3902                         bucket->ub_cnt = i;
3903                 }
3904         }
3905
3906         cnt = bucket->ub_cnt;
3907         if (bucket->ub_cnt == 0) {
3908                 bucket_free(zone, bucket, udata);
3909                 counter_u64_add(zone->uz_fails, 1);
3910                 bucket = NULL;
3911         }
3912 out:
3913         if (zone->uz_max_items > 0 && cnt < maxbucket)
3914                 zone_free_limit(zone, maxbucket - cnt);
3915
3916         return (bucket);
3917 }
3918
3919 /*
3920  * Allocates a single item from a zone.
3921  *
3922  * Arguments
3923  *      zone   The zone to alloc for.
3924  *      udata  The data to be passed to the constructor.
3925  *      domain The domain to allocate from or UMA_ANYDOMAIN.
3926  *      flags  M_WAITOK, M_NOWAIT, M_ZERO.
3927  *
3928  * Returns
3929  *      NULL if there is no memory and M_NOWAIT is set
3930  *      An item if successful
3931  */
3932
3933 static void *
3934 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3935 {
3936         void *item;
3937
3938         if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0)
3939                 return (NULL);
3940
3941         /* Avoid allocs targeting empty domains. */
3942         if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3943                 domain = UMA_ANYDOMAIN;
3944
3945         if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3946                 goto fail_cnt;
3947
3948         /*
3949          * We have to call both the zone's init (not the keg's init)
3950          * and the zone's ctor.  This is because the item is going from
3951          * a keg slab directly to the user, and the user is expecting it
3952          * to be both zone-init'd as well as zone-ctor'd.
3953          */
3954         if (zone->uz_init != NULL) {
3955                 if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3956                         zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3957                         goto fail_cnt;
3958                 }
3959         }
3960         item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags,
3961             item);
3962         if (item == NULL)
3963                 goto fail;
3964
3965         counter_u64_add(zone->uz_allocs, 1);
3966         CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3967             zone->uz_name, zone);
3968
3969         return (item);
3970
3971 fail_cnt:
3972         counter_u64_add(zone->uz_fails, 1);
3973 fail:
3974         if (zone->uz_max_items > 0)
3975                 zone_free_limit(zone, 1);
3976         CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3977             zone->uz_name, zone);
3978
3979         return (NULL);
3980 }
3981
3982 /* See uma.h */
3983 void
3984 uma_zfree_smr(uma_zone_t zone, void *item)
3985 {
3986         uma_cache_t cache;
3987         uma_cache_bucket_t bucket;
3988         int itemdomain, uz_flags;
3989
3990 #ifdef UMA_ZALLOC_DEBUG
3991         KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0,
3992             ("uma_zfree_smr: called with non-SMR zone.\n"));
3993         KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer."));
3994         SMR_ASSERT_NOT_ENTERED(zone->uz_smr);
3995         if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN)
3996                 return;
3997 #endif
3998         cache = &zone->uz_cpu[curcpu];
3999         uz_flags = cache_uz_flags(cache);
4000         itemdomain = 0;
4001 #ifdef NUMA
4002         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4003                 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4004 #endif
4005         critical_enter();
4006         do {
4007                 cache = &zone->uz_cpu[curcpu];
4008                 /* SMR Zones must free to the free bucket. */
4009                 bucket = &cache->uc_freebucket;
4010 #ifdef NUMA
4011                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
4012                     PCPU_GET(domain) != itemdomain) {
4013                         bucket = &cache->uc_crossbucket;
4014                 }
4015 #endif
4016                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
4017                         cache_bucket_push(cache, bucket, item);
4018                         critical_exit();
4019                         return;
4020                 }
4021         } while (cache_free(zone, cache, NULL, item, itemdomain));
4022         critical_exit();
4023
4024         /*
4025          * If nothing else caught this, we'll just do an internal free.
4026          */
4027         zone_free_item(zone, item, NULL, SKIP_NONE);
4028 }
4029
4030 /* See uma.h */
4031 void
4032 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
4033 {
4034         uma_cache_t cache;
4035         uma_cache_bucket_t bucket;
4036         int itemdomain, uz_flags;
4037
4038         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
4039         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
4040
4041         CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone);
4042
4043 #ifdef UMA_ZALLOC_DEBUG
4044         KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0,
4045             ("uma_zfree_arg: called with SMR zone.\n"));
4046         if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN)
4047                 return;
4048 #endif
4049         /* uma_zfree(..., NULL) does nothing, to match free(9). */
4050         if (item == NULL)
4051                 return;
4052
4053         /*
4054          * We are accessing the per-cpu cache without a critical section to
4055          * fetch size and flags.  This is acceptable, if we are preempted we
4056          * will simply read another cpu's line.
4057          */
4058         cache = &zone->uz_cpu[curcpu];
4059         uz_flags = cache_uz_flags(cache);
4060         if (UMA_ALWAYS_CTORDTOR ||
4061             __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0))
4062                 item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
4063
4064         /*
4065          * The race here is acceptable.  If we miss it we'll just have to wait
4066          * a little longer for the limits to be reset.
4067          */
4068         if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) {
4069                 if (zone->uz_sleepers > 0)
4070                         goto zfree_item;
4071         }
4072
4073         /*
4074          * If possible, free to the per-CPU cache.  There are two
4075          * requirements for safe access to the per-CPU cache: (1) the thread
4076          * accessing the cache must not be preempted or yield during access,
4077          * and (2) the thread must not migrate CPUs without switching which
4078          * cache it accesses.  We rely on a critical section to prevent
4079          * preemption and migration.  We release the critical section in
4080          * order to acquire the zone mutex if we are unable to free to the
4081          * current cache; when we re-acquire the critical section, we must
4082          * detect and handle migration if it has occurred.
4083          */
4084         itemdomain = 0;
4085 #ifdef NUMA
4086         if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4087                 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4088 #endif
4089         critical_enter();
4090         do {
4091                 cache = &zone->uz_cpu[curcpu];
4092                 /*
4093                  * Try to free into the allocbucket first to give LIFO
4094                  * ordering for cache-hot datastructures.  Spill over
4095                  * into the freebucket if necessary.  Alloc will swap
4096                  * them if one runs dry.
4097                  */
4098                 bucket = &cache->uc_allocbucket;
4099 #ifdef NUMA
4100                 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
4101                     PCPU_GET(domain) != itemdomain) {
4102                         bucket = &cache->uc_crossbucket;
4103                 } else
4104 #endif
4105                 if (bucket->ucb_cnt >= bucket->ucb_entries)
4106                         bucket = &cache->uc_freebucket;
4107                 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
4108                         cache_bucket_push(cache, bucket, item);
4109                         critical_exit();
4110                         return;
4111                 }
4112         } while (cache_free(zone, cache, udata, item, itemdomain));
4113         critical_exit();
4114
4115         /*
4116          * If nothing else caught this, we'll just do an internal free.
4117          */
4118 zfree_item:
4119         zone_free_item(zone, item, udata, SKIP_DTOR);
4120 }
4121
4122 #ifdef NUMA
4123 /*
4124  * sort crossdomain free buckets to domain correct buckets and cache
4125  * them.
4126  */
4127 static void
4128 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata)
4129 {
4130         struct uma_bucketlist fullbuckets;
4131         uma_zone_domain_t zdom;
4132         uma_bucket_t b;
4133         smr_seq_t seq;
4134         void *item;
4135         int domain;
4136
4137         CTR3(KTR_UMA,
4138             "uma_zfree: zone %s(%p) draining cross bucket %p",
4139             zone->uz_name, zone, bucket);
4140
4141         /*
4142          * It is possible for buckets to arrive here out of order so we fetch
4143          * the current smr seq rather than accepting the bucket's.
4144          */
4145         seq = SMR_SEQ_INVALID;
4146         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
4147                 seq = smr_advance(zone->uz_smr);
4148
4149         /*
4150          * To avoid having ndomain * ndomain buckets for sorting we have a
4151          * lock on the current crossfree bucket.  A full matrix with
4152          * per-domain locking could be used if necessary.
4153          */
4154         STAILQ_INIT(&fullbuckets);
4155         ZONE_CROSS_LOCK(zone);
4156         while (bucket->ub_cnt > 0) {
4157                 item = bucket->ub_bucket[bucket->ub_cnt - 1];
4158                 domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
4159                 zdom = ZDOM_GET(zone, domain);
4160                 if (zdom->uzd_cross == NULL) {
4161                         zdom->uzd_cross = bucket_alloc(zone, udata, M_NOWAIT);
4162                         if (zdom->uzd_cross == NULL)
4163                                 break;
4164                 }
4165                 b = zdom->uzd_cross;
4166                 b->ub_bucket[b->ub_cnt++] = item;
4167                 b->ub_seq = seq;
4168                 if (b->ub_cnt == b->ub_entries) {
4169                         STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link);
4170                         zdom->uzd_cross = NULL;
4171                 }
4172                 bucket->ub_cnt--;
4173         }
4174         ZONE_CROSS_UNLOCK(zone);
4175         if (bucket->ub_cnt == 0)
4176                 bucket->ub_seq = SMR_SEQ_INVALID;
4177         bucket_free(zone, bucket, udata);
4178
4179         while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) {
4180                 STAILQ_REMOVE_HEAD(&fullbuckets, ub_link);
4181                 domain = _vm_phys_domain(pmap_kextract(
4182                     (vm_offset_t)b->ub_bucket[0]));
4183                 zone_put_bucket(zone, domain, b, udata, true);
4184         }
4185 }
4186 #endif
4187
4188 static void
4189 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
4190     int itemdomain, bool ws)
4191 {
4192
4193 #ifdef NUMA
4194         /*
4195          * Buckets coming from the wrong domain will be entirely for the
4196          * only other domain on two domain systems.  In this case we can
4197          * simply cache them.  Otherwise we need to sort them back to
4198          * correct domains.
4199          */
4200         if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
4201             vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) {
4202                 zone_free_cross(zone, bucket, udata);
4203                 return;
4204         }
4205 #endif
4206
4207         /*
4208          * Attempt to save the bucket in the zone's domain bucket cache.
4209          */
4210         CTR3(KTR_UMA,
4211             "uma_zfree: zone %s(%p) putting bucket %p on free list",
4212             zone->uz_name, zone, bucket);
4213         /* ub_cnt is pointing to the last free item */
4214         if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0)
4215                 itemdomain = zone_domain_lowest(zone, itemdomain);
4216         zone_put_bucket(zone, itemdomain, bucket, udata, ws);
4217 }
4218
4219 /*
4220  * Populate a free or cross bucket for the current cpu cache.  Free any
4221  * existing full bucket either to the zone cache or back to the slab layer.
4222  *
4223  * Enters and returns in a critical section.  false return indicates that
4224  * we can not satisfy this free in the cache layer.  true indicates that
4225  * the caller should retry.
4226  */
4227 static __noinline bool
4228 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
4229     int itemdomain)
4230 {
4231         uma_cache_bucket_t cbucket;
4232         uma_bucket_t newbucket, bucket;
4233
4234         CRITICAL_ASSERT(curthread);
4235
4236         if (zone->uz_bucket_size == 0)
4237                 return false;
4238
4239         cache = &zone->uz_cpu[curcpu];
4240         newbucket = NULL;
4241
4242         /*
4243          * FIRSTTOUCH domains need to free to the correct zdom.  When
4244          * enabled this is the zdom of the item.   The bucket is the
4245          * cross bucket if the current domain and itemdomain do not match.
4246          */
4247         cbucket = &cache->uc_freebucket;
4248 #ifdef NUMA
4249         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
4250                 if (PCPU_GET(domain) != itemdomain) {
4251                         cbucket = &cache->uc_crossbucket;
4252                         if (cbucket->ucb_cnt != 0)
4253                                 counter_u64_add(zone->uz_xdomain,
4254                                     cbucket->ucb_cnt);
4255                 }
4256         }
4257 #endif
4258         bucket = cache_bucket_unload(cbucket);
4259         KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries,
4260             ("cache_free: Entered with non-full free bucket."));
4261
4262         /* We are no longer associated with this CPU. */
4263         critical_exit();
4264
4265         /*
4266          * Don't let SMR zones operate without a free bucket.  Force
4267          * a synchronize and re-use this one.  We will only degrade
4268          * to a synchronize every bucket_size items rather than every
4269          * item if we fail to allocate a bucket.
4270          */
4271         if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
4272                 if (bucket != NULL)
4273                         bucket->ub_seq = smr_advance(zone->uz_smr);
4274                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
4275                 if (newbucket == NULL && bucket != NULL) {
4276                         bucket_drain(zone, bucket);
4277                         newbucket = bucket;
4278                         bucket = NULL;
4279                 }
4280         } else if (!bucketdisable)
4281                 newbucket = bucket_alloc(zone, udata, M_NOWAIT);
4282
4283         if (bucket != NULL)
4284                 zone_free_bucket(zone, bucket, udata, itemdomain, true);
4285
4286         critical_enter();
4287         if ((bucket = newbucket) == NULL)
4288                 return (false);
4289         cache = &zone->uz_cpu[curcpu];
4290 #ifdef NUMA
4291         /*
4292          * Check to see if we should be populating the cross bucket.  If it
4293          * is already populated we will fall through and attempt to populate
4294          * the free bucket.
4295          */
4296         if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) {
4297                 if (PCPU_GET(domain) != itemdomain &&
4298                     cache->uc_crossbucket.ucb_bucket == NULL) {
4299                         cache_bucket_load_cross(cache, bucket);
4300                         return (true);
4301                 }
4302         }
4303 #endif
4304         /*
4305          * We may have lost the race to fill the bucket or switched CPUs.
4306          */
4307         if (cache->uc_freebucket.ucb_bucket != NULL) {
4308                 critical_exit();
4309                 bucket_free(zone, bucket, udata);
4310                 critical_enter();
4311         } else
4312                 cache_bucket_load_free(cache, bucket);
4313
4314         return (true);
4315 }
4316
4317 void
4318 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
4319 {
4320
4321         /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
4322         random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
4323
4324         CTR2(KTR_UMA, "uma_zfree_domain zone %s(%p)", zone->uz_name, zone);
4325
4326         KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
4327             ("uma_zfree_domain: called with spinlock or critical section held"));
4328
4329         /* uma_zfree(..., NULL) does nothing, to match free(9). */
4330         if (item == NULL)
4331                 return;
4332         zone_free_item(zone, item, udata, SKIP_NONE);
4333 }
4334
4335 static void
4336 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
4337 {
4338         uma_keg_t keg;
4339         uma_domain_t dom;
4340         int freei;
4341
4342         keg = zone->uz_keg;
4343         KEG_LOCK_ASSERT(keg, slab->us_domain);
4344
4345         /* Do we need to remove from any lists? */
4346         dom = &keg->uk_domain[slab->us_domain];
4347         if (slab->us_freecount + 1 == keg->uk_ipers) {
4348                 LIST_REMOVE(slab, us_link);
4349                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
4350                 dom->ud_free_slabs++;
4351         } else if (slab->us_freecount == 0) {
4352                 LIST_REMOVE(slab, us_link);
4353                 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
4354         }
4355
4356         /* Slab management. */
4357         freei = slab_item_index(slab, keg, item);
4358         BIT_SET(keg->uk_ipers, freei, &slab->us_free);
4359         slab->us_freecount++;
4360
4361         /* Keg statistics. */
4362         dom->ud_free_items++;
4363 }
4364
4365 static void
4366 zone_release(void *arg, void **bucket, int cnt)
4367 {
4368         struct mtx *lock;
4369         uma_zone_t zone;
4370         uma_slab_t slab;
4371         uma_keg_t keg;
4372         uint8_t *mem;
4373         void *item;
4374         int i;
4375
4376         zone = arg;
4377         keg = zone->uz_keg;
4378         lock = NULL;
4379         if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0))
4380                 lock = KEG_LOCK(keg, 0);
4381         for (i = 0; i < cnt; i++) {
4382                 item = bucket[i];
4383                 if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) {
4384                         slab = vtoslab((vm_offset_t)item);
4385                 } else {
4386                         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4387                         if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0)
4388                                 slab = hash_sfind(&keg->uk_hash, mem);
4389                         else
4390                                 slab = (uma_slab_t)(mem + keg->uk_pgoff);
4391                 }
4392                 if (lock != KEG_LOCKPTR(keg, slab->us_domain)) {
4393                         if (lock != NULL)
4394                                 mtx_unlock(lock);
4395                         lock = KEG_LOCK(keg, slab->us_domain);
4396                 }
4397                 slab_free_item(zone, slab, item);
4398         }
4399         if (lock != NULL)
4400                 mtx_unlock(lock);
4401 }
4402
4403 /*
4404  * Frees a single item to any zone.
4405  *
4406  * Arguments:
4407  *      zone   The zone to free to
4408  *      item   The item we're freeing
4409  *      udata  User supplied data for the dtor
4410  *      skip   Skip dtors and finis
4411  */
4412 static __noinline void
4413 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
4414 {
4415
4416         /*
4417          * If a free is sent directly to an SMR zone we have to
4418          * synchronize immediately because the item can instantly
4419          * be reallocated. This should only happen in degenerate
4420          * cases when no memory is available for per-cpu caches.
4421          */
4422         if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE)
4423                 smr_synchronize(zone->uz_smr);
4424
4425         item_dtor(zone, item, zone->uz_size, udata, skip);
4426
4427         if (skip < SKIP_FINI && zone->uz_fini)
4428                 zone->uz_fini(item, zone->uz_size);
4429
4430         zone->uz_release(zone->uz_arg, &item, 1);
4431
4432         if (skip & SKIP_CNT)
4433                 return;
4434
4435         counter_u64_add(zone->uz_frees, 1);
4436
4437         if (zone->uz_max_items > 0)
4438                 zone_free_limit(zone, 1);
4439 }
4440
4441 /* See uma.h */
4442 int
4443 uma_zone_set_max(uma_zone_t zone, int nitems)
4444 {
4445         struct uma_bucket_zone *ubz;
4446         int count;
4447
4448         /*
4449          * XXX This can misbehave if the zone has any allocations with
4450          * no limit and a limit is imposed.  There is currently no
4451          * way to clear a limit.
4452          */
4453         ZONE_LOCK(zone);
4454         ubz = bucket_zone_max(zone, nitems);
4455         count = ubz != NULL ? ubz->ubz_entries : 0;
4456         zone->uz_bucket_size_max = zone->uz_bucket_size = count;
4457         if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
4458                 zone->uz_bucket_size_min = zone->uz_bucket_size_max;
4459         zone->uz_max_items = nitems;
4460         zone->uz_flags |= UMA_ZFLAG_LIMIT;
4461         zone_update_caches(zone);
4462         /* We may need to wake waiters. */
4463         wakeup(&zone->uz_max_items);
4464         ZONE_UNLOCK(zone);
4465
4466         return (nitems);
4467 }
4468
4469 /* See uma.h */
4470 void
4471 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
4472 {
4473         struct uma_bucket_zone *ubz;
4474         int bpcpu;
4475
4476         ZONE_LOCK(zone);
4477         ubz = bucket_zone_max(zone, nitems);
4478         if (ubz != NULL) {
4479                 bpcpu = 2;
4480                 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
4481                         /* Count the cross-domain bucket. */
4482                         bpcpu++;
4483                 nitems -= ubz->ubz_entries * bpcpu * mp_ncpus;
4484                 zone->uz_bucket_size_max = ubz->ubz_entries;
4485         } else {
4486                 zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
4487         }
4488         if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
4489                 zone->uz_bucket_size_min = zone->uz_bucket_size_max;
4490         zone->uz_bucket_max = nitems / vm_ndomains;
4491         ZONE_UNLOCK(zone);
4492 }
4493
4494 /* See uma.h */
4495 int
4496 uma_zone_get_max(uma_zone_t zone)
4497 {
4498         int nitems;
4499
4500         nitems = atomic_load_64(&zone->uz_max_items);
4501
4502         return (nitems);
4503 }
4504
4505 /* See uma.h */
4506 void
4507 uma_zone_set_warning(uma_zone_t zone, const char *warning)
4508 {
4509
4510         ZONE_ASSERT_COLD(zone);
4511         zone->uz_warning = warning;
4512 }
4513
4514 /* See uma.h */
4515 void
4516 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
4517 {
4518
4519         ZONE_ASSERT_COLD(zone);
4520         TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
4521 }
4522
4523 /* See uma.h */
4524 int
4525 uma_zone_get_cur(uma_zone_t zone)
4526 {
4527         int64_t nitems;
4528         u_int i;
4529
4530         nitems = 0;
4531         if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER)
4532                 nitems = counter_u64_fetch(zone->uz_allocs) -
4533                     counter_u64_fetch(zone->uz_frees);
4534         CPU_FOREACH(i)
4535                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) -
4536                     atomic_load_64(&zone->uz_cpu[i].uc_frees);
4537
4538         return (nitems < 0 ? 0 : nitems);
4539 }
4540
4541 static uint64_t
4542 uma_zone_get_allocs(uma_zone_t zone)
4543 {
4544         uint64_t nitems;
4545         u_int i;
4546
4547         nitems = 0;
4548         if (zone->uz_allocs != EARLY_COUNTER)
4549                 nitems = counter_u64_fetch(zone->uz_allocs);
4550         CPU_FOREACH(i)
4551                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs);
4552
4553         return (nitems);
4554 }
4555
4556 static uint64_t
4557 uma_zone_get_frees(uma_zone_t zone)
4558 {
4559         uint64_t nitems;
4560         u_int i;
4561
4562         nitems = 0;
4563         if (zone->uz_frees != EARLY_COUNTER)
4564                 nitems = counter_u64_fetch(zone->uz_frees);
4565         CPU_FOREACH(i)
4566                 nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees);
4567
4568         return (nitems);
4569 }
4570
4571 #ifdef INVARIANTS
4572 /* Used only for KEG_ASSERT_COLD(). */
4573 static uint64_t
4574 uma_keg_get_allocs(uma_keg_t keg)
4575 {
4576         uma_zone_t z;
4577         uint64_t nitems;
4578
4579         nitems = 0;
4580         LIST_FOREACH(z, &keg->uk_zones, uz_link)
4581                 nitems += uma_zone_get_allocs(z);
4582
4583         return (nitems);
4584 }
4585 #endif
4586
4587 /* See uma.h */
4588 void
4589 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
4590 {
4591         uma_keg_t keg;
4592
4593         KEG_GET(zone, keg);
4594         KEG_ASSERT_COLD(keg);
4595         keg->uk_init = uminit;
4596 }
4597
4598 /* See uma.h */
4599 void
4600 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
4601 {
4602         uma_keg_t keg;
4603
4604         KEG_GET(zone, keg);
4605         KEG_ASSERT_COLD(keg);
4606         keg->uk_fini = fini;
4607 }
4608
4609 /* See uma.h */
4610 void
4611 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
4612 {
4613
4614         ZONE_ASSERT_COLD(zone);
4615         zone->uz_init = zinit;
4616 }
4617
4618 /* See uma.h */
4619 void
4620 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
4621 {
4622
4623         ZONE_ASSERT_COLD(zone);
4624         zone->uz_fini = zfini;
4625 }
4626
4627 /* See uma.h */
4628 void
4629 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
4630 {
4631         uma_keg_t keg;
4632
4633         KEG_GET(zone, keg);
4634         KEG_ASSERT_COLD(keg);
4635         keg->uk_freef = freef;
4636 }
4637
4638 /* See uma.h */
4639 void
4640 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
4641 {
4642         uma_keg_t keg;
4643
4644         KEG_GET(zone, keg);
4645         KEG_ASSERT_COLD(keg);
4646         keg->uk_allocf = allocf;
4647 }
4648
4649 /* See uma.h */
4650 void
4651 uma_zone_set_smr(uma_zone_t zone, smr_t smr)
4652 {
4653
4654         ZONE_ASSERT_COLD(zone);
4655
4656         zone->uz_flags |= UMA_ZONE_SMR;
4657         zone->uz_smr = smr;
4658         zone_update_caches(zone);
4659 }
4660
4661 smr_t
4662 uma_zone_get_smr(uma_zone_t zone)
4663 {
4664
4665         return (zone->uz_smr);
4666 }
4667
4668 /* See uma.h */
4669 void
4670 uma_zone_reserve(uma_zone_t zone, int items)
4671 {
4672         uma_keg_t keg;
4673
4674         KEG_GET(zone, keg);
4675         KEG_ASSERT_COLD(keg);
4676         keg->uk_reserve = items;
4677 }
4678
4679 /* See uma.h */
4680 int
4681 uma_zone_reserve_kva(uma_zone_t zone, int count)
4682 {
4683         uma_keg_t keg;
4684         vm_offset_t kva;
4685         u_int pages;
4686
4687         KEG_GET(zone, keg);
4688         KEG_ASSERT_COLD(keg);
4689         ZONE_ASSERT_COLD(zone);
4690
4691         pages = howmany(count, keg->uk_ipers) * keg->uk_ppera;
4692
4693 #ifdef UMA_MD_SMALL_ALLOC
4694         if (keg->uk_ppera > 1) {
4695 #else
4696         if (1) {
4697 #endif
4698                 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
4699                 if (kva == 0)
4700                         return (0);
4701         } else
4702                 kva = 0;
4703
4704         MPASS(keg->uk_kva == 0);
4705         keg->uk_kva = kva;
4706         keg->uk_offset = 0;
4707         zone->uz_max_items = pages * keg->uk_ipers;
4708 #ifdef UMA_MD_SMALL_ALLOC
4709         keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
4710 #else
4711         keg->uk_allocf = noobj_alloc;
4712 #endif
4713         keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
4714         zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE;
4715         zone_update_caches(zone);
4716
4717         return (1);
4718 }
4719
4720 /* See uma.h */
4721 void
4722 uma_prealloc(uma_zone_t zone, int items)
4723 {
4724         struct vm_domainset_iter di;
4725         uma_domain_t dom;
4726         uma_slab_t slab;
4727         uma_keg_t keg;
4728         int aflags, domain, slabs;
4729
4730         KEG_GET(zone, keg);
4731         slabs = howmany(items, keg->uk_ipers);
4732         while (slabs-- > 0) {
4733                 aflags = M_NOWAIT;
4734                 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
4735                     &aflags);
4736                 for (;;) {
4737                         slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
4738                             aflags);
4739                         if (slab != NULL) {
4740                                 dom = &keg->uk_domain[slab->us_domain];
4741                                 /*
4742                                  * keg_alloc_slab() always returns a slab on the
4743                                  * partial list.
4744                                  */
4745                                 LIST_REMOVE(slab, us_link);
4746                                 LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
4747                                     us_link);
4748                                 dom->ud_free_slabs++;
4749                                 KEG_UNLOCK(keg, slab->us_domain);
4750                                 break;
4751                         }
4752                         if (vm_domainset_iter_policy(&di, &domain) != 0)
4753                                 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
4754                 }
4755         }
4756 }
4757
4758 /*
4759  * Returns a snapshot of memory consumption in bytes.
4760  */
4761 size_t
4762 uma_zone_memory(uma_zone_t zone)
4763 {
4764         size_t sz;
4765         int i;
4766
4767         sz = 0;
4768         if (zone->uz_flags & UMA_ZFLAG_CACHE) {
4769                 for (i = 0; i < vm_ndomains; i++)
4770                         sz += ZDOM_GET(zone, i)->uzd_nitems;
4771                 return (sz * zone->uz_size);
4772         }
4773         for (i = 0; i < vm_ndomains; i++)
4774                 sz += zone->uz_keg->uk_domain[i].ud_pages;
4775
4776         return (sz * PAGE_SIZE);
4777 }
4778
4779 /* See uma.h */
4780 void
4781 uma_reclaim(int req)
4782 {
4783
4784         CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
4785         sx_xlock(&uma_reclaim_lock);
4786         bucket_enable();
4787
4788         switch (req) {
4789         case UMA_RECLAIM_TRIM:
4790                 zone_foreach(zone_trim, NULL);
4791                 break;
4792         case UMA_RECLAIM_DRAIN:
4793         case UMA_RECLAIM_DRAIN_CPU:
4794                 zone_foreach(zone_drain, NULL);
4795                 if (req == UMA_RECLAIM_DRAIN_CPU) {
4796                         pcpu_cache_drain_safe(NULL);
4797                         zone_foreach(zone_drain, NULL);
4798                 }
4799                 break;
4800         default:
4801                 panic("unhandled reclamation request %d", req);
4802         }
4803
4804         /*
4805          * Some slabs may have been freed but this zone will be visited early
4806          * we visit again so that we can free pages that are empty once other
4807          * zones are drained.  We have to do the same for buckets.
4808          */
4809         zone_drain(slabzones[0], NULL);
4810         zone_drain(slabzones[1], NULL);
4811         bucket_zone_drain();
4812         sx_xunlock(&uma_reclaim_lock);
4813 }
4814
4815 static volatile int uma_reclaim_needed;
4816
4817 void
4818 uma_reclaim_wakeup(void)
4819 {
4820
4821         if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
4822                 wakeup(uma_reclaim);
4823 }
4824
4825 void
4826 uma_reclaim_worker(void *arg __unused)
4827 {
4828
4829         for (;;) {
4830                 sx_xlock(&uma_reclaim_lock);
4831                 while (atomic_load_int(&uma_reclaim_needed) == 0)
4832                         sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
4833                             hz);
4834                 sx_xunlock(&uma_reclaim_lock);
4835                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
4836                 uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
4837                 atomic_store_int(&uma_reclaim_needed, 0);
4838                 /* Don't fire more than once per-second. */
4839                 pause("umarclslp", hz);
4840         }
4841 }
4842
4843 /* See uma.h */
4844 void
4845 uma_zone_reclaim(uma_zone_t zone, int req)
4846 {
4847
4848         switch (req) {
4849         case UMA_RECLAIM_TRIM:
4850                 zone_trim(zone, NULL);
4851                 break;
4852         case UMA_RECLAIM_DRAIN:
4853                 zone_drain(zone, NULL);
4854                 break;
4855         case UMA_RECLAIM_DRAIN_CPU:
4856                 pcpu_cache_drain_safe(zone);
4857                 zone_drain(zone, NULL);
4858                 break;
4859         default:
4860                 panic("unhandled reclamation request %d", req);
4861         }
4862 }
4863
4864 /* See uma.h */
4865 int
4866 uma_zone_exhausted(uma_zone_t zone)
4867 {
4868
4869         return (atomic_load_32(&zone->uz_sleepers) > 0);
4870 }
4871
4872 unsigned long
4873 uma_limit(void)
4874 {
4875
4876         return (uma_kmem_limit);
4877 }
4878
4879 void
4880 uma_set_limit(unsigned long limit)
4881 {
4882
4883         uma_kmem_limit = limit;
4884 }
4885
4886 unsigned long
4887 uma_size(void)
4888 {
4889
4890         return (atomic_load_long(&uma_kmem_total));
4891 }
4892
4893 long
4894 uma_avail(void)
4895 {
4896
4897         return (uma_kmem_limit - uma_size());
4898 }
4899
4900 #ifdef DDB
4901 /*
4902  * Generate statistics across both the zone and its per-cpu cache's.  Return
4903  * desired statistics if the pointer is non-NULL for that statistic.
4904  *
4905  * Note: does not update the zone statistics, as it can't safely clear the
4906  * per-CPU cache statistic.
4907  *
4908  */
4909 static void
4910 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
4911     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
4912 {
4913         uma_cache_t cache;
4914         uint64_t allocs, frees, sleeps, xdomain;
4915         int cachefree, cpu;
4916
4917         allocs = frees = sleeps = xdomain = 0;
4918         cachefree = 0;
4919         CPU_FOREACH(cpu) {
4920                 cache = &z->uz_cpu[cpu];
4921                 cachefree += cache->uc_allocbucket.ucb_cnt;
4922                 cachefree += cache->uc_freebucket.ucb_cnt;
4923                 xdomain += cache->uc_crossbucket.ucb_cnt;
4924                 cachefree += cache->uc_crossbucket.ucb_cnt;
4925                 allocs += cache->uc_allocs;
4926                 frees += cache->uc_frees;
4927         }
4928         allocs += counter_u64_fetch(z->uz_allocs);
4929         frees += counter_u64_fetch(z->uz_frees);
4930         xdomain += counter_u64_fetch(z->uz_xdomain);
4931         sleeps += z->uz_sleeps;
4932         if (cachefreep != NULL)
4933                 *cachefreep = cachefree;
4934         if (allocsp != NULL)
4935                 *allocsp = allocs;
4936         if (freesp != NULL)
4937                 *freesp = frees;
4938         if (sleepsp != NULL)
4939                 *sleepsp = sleeps;
4940         if (xdomainp != NULL)
4941                 *xdomainp = xdomain;
4942 }
4943 #endif /* DDB */
4944
4945 static int
4946 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4947 {
4948         uma_keg_t kz;
4949         uma_zone_t z;
4950         int count;
4951
4952         count = 0;
4953         rw_rlock(&uma_rwlock);
4954         LIST_FOREACH(kz, &uma_kegs, uk_link) {
4955                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
4956                         count++;
4957         }
4958         LIST_FOREACH(z, &uma_cachezones, uz_link)
4959                 count++;
4960
4961         rw_runlock(&uma_rwlock);
4962         return (sysctl_handle_int(oidp, &count, 0, req));
4963 }
4964
4965 static void
4966 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4967     struct uma_percpu_stat *ups, bool internal)
4968 {
4969         uma_zone_domain_t zdom;
4970         uma_cache_t cache;
4971         int i;
4972
4973
4974         for (i = 0; i < vm_ndomains; i++) {
4975                 zdom = ZDOM_GET(z, i);
4976                 uth->uth_zone_free += zdom->uzd_nitems;
4977         }
4978         uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4979         uth->uth_frees = counter_u64_fetch(z->uz_frees);
4980         uth->uth_fails = counter_u64_fetch(z->uz_fails);
4981         uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain);
4982         uth->uth_sleeps = z->uz_sleeps;
4983
4984         for (i = 0; i < mp_maxid + 1; i++) {
4985                 bzero(&ups[i], sizeof(*ups));
4986                 if (internal || CPU_ABSENT(i))
4987                         continue;
4988                 cache = &z->uz_cpu[i];
4989                 ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt;
4990                 ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt;
4991                 ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt;
4992                 ups[i].ups_allocs = cache->uc_allocs;
4993                 ups[i].ups_frees = cache->uc_frees;
4994         }
4995 }
4996
4997 static int
4998 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4999 {
5000         struct uma_stream_header ush;
5001         struct uma_type_header uth;
5002         struct uma_percpu_stat *ups;
5003         struct sbuf sbuf;
5004         uma_keg_t kz;
5005         uma_zone_t z;
5006         uint64_t items;
5007         uint32_t kfree, pages;
5008         int count, error, i;
5009
5010         error = sysctl_wire_old_buffer(req, 0);
5011         if (error != 0)
5012                 return (error);
5013         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
5014         sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
5015         ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
5016
5017         count = 0;
5018         rw_rlock(&uma_rwlock);
5019         LIST_FOREACH(kz, &uma_kegs, uk_link) {
5020                 LIST_FOREACH(z, &kz->uk_zones, uz_link)
5021                         count++;
5022         }
5023
5024         LIST_FOREACH(z, &uma_cachezones, uz_link)
5025                 count++;
5026
5027         /*
5028          * Insert stream header.
5029          */
5030         bzero(&ush, sizeof(ush));
5031         ush.ush_version = UMA_STREAM_VERSION;
5032         ush.ush_maxcpus = (mp_maxid + 1);
5033         ush.ush_count = count;
5034         (void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
5035
5036         LIST_FOREACH(kz, &uma_kegs, uk_link) {
5037                 kfree = pages = 0;
5038                 for (i = 0; i < vm_ndomains; i++) {
5039                         kfree += kz->uk_domain[i].ud_free_items;
5040                         pages += kz->uk_domain[i].ud_pages;
5041                 }
5042                 LIST_FOREACH(z, &kz->uk_zones, uz_link) {
5043                         bzero(&uth, sizeof(uth));
5044                         strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
5045                         uth.uth_align = kz->uk_align;
5046                         uth.uth_size = kz->uk_size;
5047                         uth.uth_rsize = kz->uk_rsize;
5048                         if (z->uz_max_items > 0) {
5049                                 items = UZ_ITEMS_COUNT(z->uz_items);
5050                                 uth.uth_pages = (items / kz->uk_ipers) *
5051                                         kz->uk_ppera;
5052                         } else
5053                                 uth.uth_pages = pages;
5054                         uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
5055                             kz->uk_ppera;
5056                         uth.uth_limit = z->uz_max_items;
5057                         uth.uth_keg_free = kfree;
5058
5059                         /*
5060                          * A zone is secondary is it is not the first entry
5061                          * on the keg's zone list.
5062                          */
5063                         if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
5064                             (LIST_FIRST(&kz->uk_zones) != z))
5065                                 uth.uth_zone_flags = UTH_ZONE_SECONDARY;
5066                         uma_vm_zone_stats(&uth, z, &sbuf, ups,
5067                             kz->uk_flags & UMA_ZFLAG_INTERNAL);
5068                         (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
5069                         for (i = 0; i < mp_maxid + 1; i++)
5070                                 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
5071                 }
5072         }
5073         LIST_FOREACH(z, &uma_cachezones, uz_link) {
5074                 bzero(&uth, sizeof(uth));
5075                 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
5076                 uth.uth_size = z->uz_size;
5077                 uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
5078                 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
5079                 for (i = 0; i < mp_maxid + 1; i++)
5080                         (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
5081         }
5082
5083         rw_runlock(&uma_rwlock);
5084         error = sbuf_finish(&sbuf);
5085         sbuf_delete(&sbuf);
5086         free(ups, M_TEMP);
5087         return (error);
5088 }
5089
5090 int
5091 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
5092 {
5093         uma_zone_t zone = *(uma_zone_t *)arg1;
5094         int error, max;
5095
5096         max = uma_zone_get_max(zone);
5097         error = sysctl_handle_int(oidp, &max, 0, req);
5098         if (error || !req->newptr)
5099                 return (error);
5100
5101         uma_zone_set_max(zone, max);
5102
5103         return (0);
5104 }
5105
5106 int
5107 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
5108 {
5109         uma_zone_t zone;
5110         int cur;
5111
5112         /*
5113          * Some callers want to add sysctls for global zones that
5114          * may not yet exist so they pass a pointer to a pointer.
5115          */
5116         if (arg2 == 0)
5117                 zone = *(uma_zone_t *)arg1;
5118         else
5119                 zone = arg1;
5120         cur = uma_zone_get_cur(zone);
5121         return (sysctl_handle_int(oidp, &cur, 0, req));
5122 }
5123
5124 static int
5125 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
5126 {
5127         uma_zone_t zone = arg1;
5128         uint64_t cur;
5129
5130         cur = uma_zone_get_allocs(zone);
5131         return (sysctl_handle_64(oidp, &cur, 0, req));
5132 }
5133
5134 static int
5135 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
5136 {
5137         uma_zone_t zone = arg1;
5138         uint64_t cur;
5139
5140         cur = uma_zone_get_frees(zone);
5141         return (sysctl_handle_64(oidp, &cur, 0, req));
5142 }
5143
5144 static int
5145 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
5146 {
5147         struct sbuf sbuf;
5148         uma_zone_t zone = arg1;
5149         int error;
5150
5151         sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
5152         if (zone->uz_flags != 0)
5153                 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
5154         else
5155                 sbuf_printf(&sbuf, "0");
5156         error = sbuf_finish(&sbuf);
5157         sbuf_delete(&sbuf);
5158
5159         return (error);
5160 }
5161
5162 static int
5163 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
5164 {
5165         uma_keg_t keg = arg1;
5166         int avail, effpct, total;
5167
5168         total = keg->uk_ppera * PAGE_SIZE;
5169         if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0)
5170                 total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize;
5171         /*
5172          * We consider the client's requested size and alignment here, not the
5173          * real size determination uk_rsize, because we also adjust the real
5174          * size for internal implementation reasons (max bitset size).
5175          */
5176         avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
5177         if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
5178                 avail *= mp_maxid + 1;
5179         effpct = 100 * avail / total;
5180         return (sysctl_handle_int(oidp, &effpct, 0, req));
5181 }
5182
5183 static int
5184 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
5185 {
5186         uma_zone_t zone = arg1;
5187         uint64_t cur;
5188
5189         cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
5190         return (sysctl_handle_64(oidp, &cur, 0, req));
5191 }
5192
5193 #ifdef INVARIANTS
5194 static uma_slab_t
5195 uma_dbg_getslab(uma_zone_t zone, void *item)
5196 {
5197         uma_slab_t slab;
5198         uma_keg_t keg;
5199         uint8_t *mem;
5200
5201         /*
5202          * It is safe to return the slab here even though the
5203          * zone is unlocked because the item's allocation state
5204          * essentially holds a reference.
5205          */
5206         mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
5207         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
5208                 return (NULL);
5209         if (zone->uz_flags & UMA_ZFLAG_VTOSLAB)
5210                 return (vtoslab((vm_offset_t)mem));
5211         keg = zone->uz_keg;
5212         if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0)
5213                 return ((uma_slab_t)(mem + keg->uk_pgoff));
5214         KEG_LOCK(keg, 0);
5215         slab = hash_sfind(&keg->uk_hash, mem);
5216         KEG_UNLOCK(keg, 0);
5217
5218         return (slab);
5219 }
5220
5221 static bool
5222 uma_dbg_zskip(uma_zone_t zone, void *mem)
5223 {
5224
5225         if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0)
5226                 return (true);
5227
5228         return (uma_dbg_kskip(zone->uz_keg, mem));
5229 }
5230
5231 static bool
5232 uma_dbg_kskip(uma_keg_t keg, void *mem)
5233 {
5234         uintptr_t idx;
5235
5236         if (dbg_divisor == 0)
5237                 return (true);
5238
5239         if (dbg_divisor == 1)
5240                 return (false);
5241
5242         idx = (uintptr_t)mem >> PAGE_SHIFT;
5243         if (keg->uk_ipers > 1) {
5244                 idx *= keg->uk_ipers;
5245                 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
5246         }
5247
5248         if ((idx / dbg_divisor) * dbg_divisor != idx) {
5249                 counter_u64_add(uma_skip_cnt, 1);
5250                 return (true);
5251         }
5252         counter_u64_add(uma_dbg_cnt, 1);
5253
5254         return (false);
5255 }
5256
5257 /*
5258  * Set up the slab's freei data such that uma_dbg_free can function.
5259  *
5260  */
5261 static void
5262 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
5263 {
5264         uma_keg_t keg;
5265         int freei;
5266
5267         if (slab == NULL) {
5268                 slab = uma_dbg_getslab(zone, item);
5269                 if (slab == NULL)
5270                         panic("uma: item %p did not belong to zone %s\n",
5271                             item, zone->uz_name);
5272         }
5273         keg = zone->uz_keg;
5274         freei = slab_item_index(slab, keg, item);
5275
5276         if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
5277                 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
5278                     item, zone, zone->uz_name, slab, freei);
5279         BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
5280 }
5281
5282 /*
5283  * Verifies freed addresses.  Checks for alignment, valid slab membership
5284  * and duplicate frees.
5285  *
5286  */
5287 static void
5288 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
5289 {
5290         uma_keg_t keg;
5291         int freei;
5292
5293         if (slab == NULL) {
5294                 slab = uma_dbg_getslab(zone, item);
5295                 if (slab == NULL)
5296                         panic("uma: Freed item %p did not belong to zone %s\n",
5297                             item, zone->uz_name);
5298         }
5299         keg = zone->uz_keg;
5300         freei = slab_item_index(slab, keg, item);
5301
5302         if (freei >= keg->uk_ipers)
5303                 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
5304                     item, zone, zone->uz_name, slab, freei);
5305
5306         if (slab_item(slab, keg, freei) != item)
5307                 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
5308                     item, zone, zone->uz_name, slab, freei);
5309
5310         if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
5311                 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
5312                     item, zone, zone->uz_name, slab, freei);
5313
5314         BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
5315 }
5316 #endif /* INVARIANTS */
5317
5318 #ifdef DDB
5319 static int64_t
5320 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
5321     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
5322 {
5323         uint64_t frees;
5324         int i;
5325
5326         if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
5327                 *allocs = counter_u64_fetch(z->uz_allocs);
5328                 frees = counter_u64_fetch(z->uz_frees);
5329                 *sleeps = z->uz_sleeps;
5330                 *cachefree = 0;
5331                 *xdomain = 0;
5332         } else
5333                 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
5334                     xdomain);
5335         for (i = 0; i < vm_ndomains; i++) {
5336                 *cachefree += ZDOM_GET(z, i)->uzd_nitems;
5337                 if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
5338                     (LIST_FIRST(&kz->uk_zones) != z)))
5339                         *cachefree += kz->uk_domain[i].ud_free_items;
5340         }
5341         *used = *allocs - frees;
5342         return (((int64_t)*used + *cachefree) * kz->uk_size);
5343 }
5344
5345 DB_SHOW_COMMAND(uma, db_show_uma)
5346 {
5347         const char *fmt_hdr, *fmt_entry;
5348         uma_keg_t kz;
5349         uma_zone_t z;
5350         uint64_t allocs, used, sleeps, xdomain;
5351         long cachefree;
5352         /* variables for sorting */
5353         uma_keg_t cur_keg;
5354         uma_zone_t cur_zone, last_zone;
5355         int64_t cur_size, last_size, size;
5356         int ties;
5357
5358         /* /i option produces machine-parseable CSV output */
5359         if (modif[0] == 'i') {
5360                 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
5361                 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
5362         } else {
5363                 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
5364                 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
5365         }
5366
5367         db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
5368             "Sleeps", "Bucket", "Total Mem", "XFree");
5369
5370         /* Sort the zones with largest size first. */
5371         last_zone = NULL;
5372         last_size = INT64_MAX;
5373         for (;;) {
5374                 cur_zone = NULL;
5375                 cur_size = -1;
5376                 ties = 0;
5377                 LIST_FOREACH(kz, &uma_kegs, uk_link) {
5378                         LIST_FOREACH(z, &kz->uk_zones, uz_link) {
5379                                 /*
5380                                  * In the case of size ties, print out zones
5381                                  * in the order they are encountered.  That is,
5382                                  * when we encounter the most recently output
5383                                  * zone, we have already printed all preceding
5384                                  * ties, and we must print all following ties.
5385                                  */
5386                                 if (z == last_zone) {
5387                                         ties = 1;
5388                                         continue;
5389                                 }
5390                                 size = get_uma_stats(kz, z, &allocs, &used,
5391                                     &sleeps, &cachefree, &xdomain);
5392                                 if (size > cur_size && size < last_size + ties)
5393                                 {
5394                                         cur_size = size;
5395                                         cur_zone = z;
5396                                         cur_keg = kz;
5397                                 }
5398                         }
5399                 }
5400                 if (cur_zone == NULL)
5401                         break;
5402
5403                 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
5404                     &sleeps, &cachefree, &xdomain);
5405                 db_printf(fmt_entry, cur_zone->uz_name,
5406                     (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
5407                     (uintmax_t)allocs, (uintmax_t)sleeps,
5408                     (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
5409                     xdomain);
5410
5411                 if (db_pager_quit)
5412                         return;
5413                 last_zone = cur_zone;
5414                 last_size = cur_size;
5415         }
5416 }
5417
5418 DB_SHOW_COMMAND(umacache, db_show_umacache)
5419 {
5420         uma_zone_t z;
5421         uint64_t allocs, frees;
5422         long cachefree;
5423         int i;
5424
5425         db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
5426             "Requests", "Bucket");
5427         LIST_FOREACH(z, &uma_cachezones, uz_link) {
5428                 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
5429                 for (i = 0; i < vm_ndomains; i++)
5430                         cachefree += ZDOM_GET(z, i)->uzd_nitems;
5431                 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
5432                     z->uz_name, (uintmax_t)z->uz_size,
5433                     (intmax_t)(allocs - frees), cachefree,
5434                     (uintmax_t)allocs, z->uz_bucket_size);
5435                 if (db_pager_quit)
5436                         return;
5437         }
5438 }
5439 #endif  /* DDB */