sys/vm/uma_core.c

   1 /*
   2  * Copyright (c) 2002, Jeffrey Roberson <jroberson@chesapeake.net>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice unmodified, this list of conditions, and the following
  10  *    disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  *
  26  * $FreeBSD$
  27  *
  28  */
  29
  30 /*
  31  * uma_core.c  Implementation of the Universal Memory allocator
  32  *
  33  * This allocator is intended to replace the multitude of similar object caches
  34  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
  35  * efficient.  A primary design goal is to return unused memory to the rest of
  36  * the system.  This will make the system as a whole more flexible due to the
  37  * ability to move memory to subsystems which most need it instead of leaving
  38  * pools of reserved memory unused.
  39  *
  40  * The basic ideas stem from similar slab/zone based allocators whose algorithms
  41  * are well known.
  42  *
  43  */
  44
  45 /*
  46  * TODO:
  47  *      - Improve memory usage for large allocations
  48  *      - Improve INVARIANTS (0xdeadc0de write out)
  49  *      - Investigate cache size adjustments
  50  */
  51
  52 /* I should really use ktr.. */
  53 /*
  54 #define UMA_DEBUG 1
  55 #define UMA_DEBUG_ALLOC 1
  56 #define UMA_DEBUG_ALLOC_1 1
  57 */
  58
  59
  60 #include "opt_param.h"
  61 #include <sys/param.h>
  62 #include <sys/systm.h>
  63 #include <sys/kernel.h>
  64 #include <sys/types.h>
  65 #include <sys/queue.h>
  66 #include <sys/malloc.h>
  67 #include <sys/lock.h>
  68 #include <sys/sysctl.h>
  69 #include <machine/types.h>
  70 #include <sys/mutex.h>
  71 #include <sys/smp.h>
  72
  73 #include <vm/vm.h>
  74 #include <vm/vm_object.h>
  75 #include <vm/vm_page.h>
  76 #include <vm/vm_param.h>
  77 #include <vm/vm_map.h>
  78 #include <vm/vm_kern.h>
  79 #include <vm/vm_extern.h>
  80 #include <vm/uma.h>
  81 #include <vm/uma_int.h>
  82
  83 /*
  84  * This is the zone from which all zones are spawned.  The idea is that even
  85  * the zone heads are allocated from the allocator, so we use the bss section
  86  * to bootstrap us.
  87  */
  88 static struct uma_zone master_zone;
  89 static uma_zone_t zones = &master_zone;
  90
  91 /* Zone supplying the detached slab headers that slab_zalloc() uses for OFFPAGE zones. */
  92 static uma_zone_t slabzone;
  93
  94 /*
  95  * The initial (UMA_HASH_SIZE_INIT sized) hash tables come out of this zone so
  96  * they can be allocated prior to malloc coming up; see hash_expand().
  97  */
  98 static uma_zone_t hashzone;
  99
 100 /*
 101  * Zone that buckets come from; cache_drain() hands drained buckets back to it.
 102  */
 103 static uma_zone_t bucketzone;
 104
 105 /* Linked list of all zones in the system */
 106 static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);
 107
 108 /* This mutex protects the zone list */
 109 static struct mtx uma_mtx;
 110
 111 /* Linked list of boot time pages; slab_zalloc() takes slabs from here while !booted */
 112 static LIST_HEAD(,uma_slab) uma_boot_pages =
 113     LIST_HEAD_INITIALIZER(&uma_boot_pages);
 114
 115 /* Count of boot time pages still on uma_boot_pages; slab_zalloc() panics at zero */
 116 static int uma_boot_free = 0;
 117
 118 /* Is the VM done starting up?  Nonzero makes slab_zalloc() call the zone's uz_allocf */
 119 static int booted = 0;
 120
 121 /* Callout handle that uma_timeout() re-arms every UMA_WORKING_TIME * hz ticks */
 122 static struct callout uma_callout;
 123
 124 /* This is mp_maxid + 1, for use while looping over each cpu */
 125 static int maxcpu;
 126
 127 /*
 128  * This structure is passed as the zone ctor arg (the udata of zone_ctor()) so
 129  * that I don't have to create a special allocation function just for zones.
 130  */
 131 struct uma_zctor_args {
 132         char *name;             /* Stored as uz_name; also used to build the per cpu lock name */
 133         int size;               /* Stored as uz_size; > UMA_SLAB_SIZE selects zone_large_init() */
 134         uma_ctor ctor;          /* Stored as uz_ctor */
 135         uma_dtor dtor;          /* Stored as uz_dtor */
 136         uma_init uminit;        /* Stored as uz_init, unless UMA_ZONE_ZINIT replaces it with zero_init */
 137         uma_fini fini;          /* NOTE(review): not stored by the visible part of zone_ctor() - confirm */
 138         int align;              /* Stored as uz_align; used as a bit mask by zone_small_init() */
 139         u_int16_t flags;        /* UMA_ZONE_* bits, translated to UMA_ZFLAG_* by zone_ctor() */
 140 };
 141
 142 /*
 143  * This is the malloc hash table which is used to find the slab (and so the
 144  * zone) that a malloc allocation came from.  It is not currently resizeable.
 145  * The memory for the actual hash bucket is allocated in kmeminit.
 146  */
 147 struct uma_hash mhash;
 148 struct uma_hash *mallochash = &mhash;
 149
 150 /* Forward declarations; all file-local except uma_print_zone() and uma_print_stats(). */
 151
 152 static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
 153 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
 154 static void page_free(void *, int, u_int8_t);
 155 static uma_slab_t slab_zalloc(uma_zone_t, int);
 156 static void cache_drain(uma_zone_t);
 157 static void bucket_drain(uma_zone_t, uma_bucket_t);
 158 static void zone_drain(uma_zone_t);
 159 static void zone_ctor(void *, int, void *);
 160 static void zero_init(void *, int);
 161 static void zone_small_init(uma_zone_t zone);
 162 static void zone_large_init(uma_zone_t zone);
 163 static void zone_foreach(void (*zfunc)(uma_zone_t));
 164 static void zone_timeout(uma_zone_t zone);
 165 static void hash_expand(struct uma_hash *);
 166 static void uma_timeout(void *);
 167 static void uma_startup3(void);
 168 static void *uma_zalloc_internal(uma_zone_t, void *, int, int *, int);
 169 static void uma_zfree_internal(uma_zone_t,
 170     void *, void *, int);
 171 void uma_print_zone(uma_zone_t);
 172 void uma_print_stats(void);
 173 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
 174 /* vm.zone: read-only string sysctl served by sysctl_vm_zone() */
 175 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
 176     NULL, 0, sysctl_vm_zone, "A", "Zone Info");
 177 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 178
 179
 180 /*
 181  * Routine called by timeout which is used to fire off some time interval
 182  * based calculations.  (working set, stats, etc.)
 183  *
 184  * Arguments:
 185  *      arg   Unused
 186  *
 187  * Returns:
 188  *      Nothing
 189  */
 190 static void
 191 uma_timeout(void *unused)
 192 {
 193         zone_foreach(zone_timeout);
 194
 195         /* Reschedule this event */
 196         callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
 197 }
 198
 199 /*
 200  * Routine to perform timeout driven calculations.  This does the working set
 201  * as well as hash expanding, and per cpu statistics aggregation.
 202  *
 203  *  Arguments:
 204  *      zone  The zone to operate on
 205  *
 206  *  Returns:
 207  *      Nothing
 208  */
 209 static void
 210 zone_timeout(uma_zone_t zone)
 211 {
 212         uma_cache_t cache;
 213         u_int64_t alloc;
 214         int free;
 215         int cpu;
 216
 217         alloc = 0;
 218         free = 0;
 219
 220         /*
 221          * Aggregate per cpu cache statistics back to the zone.
 222          *
 223          * I may rewrite this to set a flag in the per cpu cache instead of
 224          * locking.  If the flag is not cleared on the next round I will have
 225          * to lock and do it here instead so that the statistics don't get too
 226          * far out of sync.
 227          */
 228         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
 229                 for (cpu = 0; cpu < maxcpu; cpu++) {
 230                         if (CPU_ABSENT(cpu))
 231                                 continue;
 232                         CPU_LOCK(zone, cpu);
 233                         cache = &zone->uz_cpu[cpu];
 234                         /* Add them up, and reset */
 235                         alloc += cache->uc_allocs;
 236                         cache->uc_allocs = 0;
 237                         if (cache->uc_allocbucket)
 238                                 free += cache->uc_allocbucket->ub_ptr + 1;
 239                         if (cache->uc_freebucket)
 240                                 free += cache->uc_freebucket->ub_ptr + 1;
 241                         CPU_UNLOCK(zone, cpu);
 242                 }
 243         }
 244
 245         /* Now push these stats back into the zone.. */
 246         ZONE_LOCK(zone);
 247         zone->uz_allocs += alloc;
 248
 249         /*
 250          * cachefree is an instantanious snapshot of what is in the per cpu
 251          * caches, not an accurate counter
 252          */
 253         zone->uz_cachefree = free;
 254
 255         /*
 256          * Expand the zone hash table.
 257          *
 258          * This is done if the number of slabs is larger than the hash size.
 259          * What I'm trying to do here is completely reduce collisions.  This
 260          * may be a little aggressive.  Should I allow for two collisions max?
 261          */
 262
 263         if ((zone->uz_flags & UMA_ZFLAG_OFFPAGE) &&
 264             !(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
 265                 if (zone->uz_pages / zone->uz_ppera
 266                     >= zone->uz_hash.uh_hashsize)
 267                         hash_expand(&zone->uz_hash);
 268         }
 269
 270         /*
 271          * Here we compute the working set size as the total number of items
 272          * left outstanding since the last time interval.  This is slightly
 273          * suboptimal. What we really want is the highest number of outstanding
 274          * items during the last time quantum.  This should be close enough.
 275          *
 276          * The working set size is used to throttle the zone_drain function.
 277          * We don't want to return memory that we may need again immediately.
 278          */
 279         alloc = zone->uz_allocs - zone->uz_oallocs;
 280         zone->uz_oallocs = zone->uz_allocs;
 281         zone->uz_wssize = alloc;
 282
 283         ZONE_UNLOCK(zone);
 284 }
 285
 286 /*
 287  * Expands the hash table for OFFPAGE zones.  This is done from zone_timeout
 288  * to reduce collisions.  This must not be done in the regular allocation path,
 289  * otherwise, we can recurse on the vm while allocating pages.
 290  *
 291  * Arguments:
 292  *      hash  The hash you want to expand by a factor of two.
 293  *
 294  * Returns:
 295  *      Nothing
 296  *
 297  * Discussion:
 298  */
 299 static void
 300 hash_expand(struct uma_hash *hash)
 301 {
 302         struct slabhead *newhash;
 303         struct slabhead *oldhash;
 304         uma_slab_t slab;
 305         int hzonefree;
 306         int hashsize;
 307         int alloc;
 308         int hval;
 309         int i;
 310
 311
 312         /*
 313          * Remember the old hash size and see if it has to go back to the
 314          * hash zone, or malloc.  The hash zone is used for the initial hash
 315          */
 316
 317         hashsize = hash->uh_hashsize;
 318         oldhash = hash->uh_slab_hash;
 319
 320         if (hashsize == UMA_HASH_SIZE_INIT)
 321                 hzonefree = 1;
 322         else
 323                 hzonefree = 0;
 324
 325
 326         /* We're just going to go to a power of two greater */
 327         if (hash->uh_hashsize)  {
 328                 alloc = sizeof(hash->uh_slab_hash[0]) * (hash->uh_hashsize * 2);
 329                 /* XXX Shouldn't be abusing DEVBUF here */
 330                 newhash = (struct slabhead *)malloc(alloc, M_DEVBUF, M_NOWAIT);
 331                 if (newhash == NULL) {
 332                         return;
 333                 }
 334                 hash->uh_hashsize *= 2;
 335         } else {
 336                 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
 337                 newhash = uma_zalloc_internal(hashzone, NULL, M_WAITOK, NULL, -1);
 338                 hash->uh_hashsize = UMA_HASH_SIZE_INIT;
 339         }
 340
 341         bzero(newhash, alloc);
 342
 343         hash->uh_hashmask = hash->uh_hashsize - 1;
 344
 345         /*
 346          * I need to investigate hash algorithms for resizing without a
 347          * full rehash.
 348          */
 349
 350         for (i = 0; i < hashsize; i++)
 351                 while (!SLIST_EMPTY(&hash->uh_slab_hash[i])) {
 352                         slab = SLIST_FIRST(&hash->uh_slab_hash[i]);
 353                         SLIST_REMOVE_HEAD(&hash->uh_slab_hash[i], us_hlink);
 354                         hval = UMA_HASH(hash, slab->us_data);
 355                         SLIST_INSERT_HEAD(&newhash[hval], slab, us_hlink);
 356                 }
 357
 358         if (hash->uh_slab_hash) {
 359                 if (hzonefree)
 360                         uma_zfree_internal(hashzone,
 361                             hash->uh_slab_hash, NULL, 0);
 362                 else
 363                         free(hash->uh_slab_hash, M_DEVBUF);
 364         }
 365         hash->uh_slab_hash = newhash;
 366
 367         return;
 368 }
 369
 370 /*
 371  * Frees all outstanding items in a bucket
 372  *
 373  * Arguments:
 374  *      zone   The zone to free to, must be unlocked.
 375  *      bucket The free/alloc bucket with items, cpu queue must be locked.
 376  *
 377  * Returns:
 378  *      Nothing
 379  */
 380
 381 static void
 382 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 383 {
 384         uma_slab_t slab;
 385         int mzone;
 386         void *item;
 387
 388         if (bucket == NULL)
 389                 return;
 390
 391         slab = NULL;
 392         mzone = 0;
 393
 394         /* We have to lookup the slab again for malloc.. */
 395         if (zone->uz_flags & UMA_ZFLAG_MALLOC)
 396                 mzone = 1;
 397
 398         while (bucket->ub_ptr > -1)  {
 399                 item = bucket->ub_bucket[bucket->ub_ptr];
 400 #ifdef INVARIANTS
 401                 bucket->ub_bucket[bucket->ub_ptr] = NULL;
 402                 KASSERT(item != NULL,
 403                     ("bucket_drain: botched ptr, item is NULL"));
 404 #endif
 405                 bucket->ub_ptr--;
 406                 /*
 407                  * This is extremely inefficient.  The slab pointer was passed
 408                  * to uma_zfree_arg, but we lost it because the buckets don't
 409                  * hold them.  This will go away when free() gets a size passed
 410                  * to it.
 411                  */
 412                 if (mzone)
 413                         slab = hash_sfind(mallochash,
 414                             (u_int8_t *)((unsigned long)item &
 415                            (~UMA_SLAB_MASK)));
 416                 uma_zfree_internal(zone, item, slab, 1);
 417         }
 418 }
 419
 420 /*
 421  * Drains the per cpu caches for a zone.
 422  *
 423  * Arguments:
 424  *      zone  The zone to drain, must be unlocked.
 425  *
 426  * Returns:
 427  *      Nothing
 428  *
 429  * This function returns with the zone locked so that the per cpu queues can
 430  * not be filled until zone_drain is finished.
 431  *
 432  */
 433 static void
 434 cache_drain(uma_zone_t zone)
 435 {
 436         uma_bucket_t bucket;
 437         uma_cache_t cache;
 438         int cpu;
 439
 440         /*
 441          * Flush out the per cpu queues.
 442          *
 443          * XXX This causes unneccisary thrashing due to immediately having
 444          * empty per cpu queues.  I need to improve this.
 445          */
 446
 447         /*
 448          * We have to lock each cpu cache before locking the zone
 449          */
 450         ZONE_UNLOCK(zone);
 451
 452         for (cpu = 0; cpu < maxcpu; cpu++) {
 453                 if (CPU_ABSENT(cpu))
 454                         continue;
 455                 CPU_LOCK(zone, cpu);
 456                 cache = &zone->uz_cpu[cpu];
 457                 bucket_drain(zone, cache->uc_allocbucket);
 458                 bucket_drain(zone, cache->uc_freebucket);
 459         }
 460
 461         /*
 462          * Drain the bucket queues and free the buckets, we just keep two per
 463          * cpu (alloc/free).
 464          */
 465         ZONE_LOCK(zone);
 466         while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
 467                 LIST_REMOVE(bucket, ub_link);
 468                 ZONE_UNLOCK(zone);
 469                 bucket_drain(zone, bucket);
 470                 uma_zfree_internal(bucketzone, bucket, NULL, 0);
 471                 ZONE_LOCK(zone);
 472         }
 473
 474         /* Now we do the free queue.. */
 475         while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
 476                 LIST_REMOVE(bucket, ub_link);
 477                 uma_zfree_internal(bucketzone, bucket, NULL, 0);
 478         }
 479
 480         /* We unlock here, but they will all block until the zone is unlocked */
 481         for (cpu = 0; cpu < maxcpu; cpu++) {
 482                 if (CPU_ABSENT(cpu))
 483                         continue;
 484                 CPU_UNLOCK(zone, cpu);
 485         }
 486 }
 487
 488 /*
 489  * Frees pages from a zone back to the system.  This is done on demand from
 490  * the pageout daemon.
 491  *
 492  * Arguments:
 493  *      zone  The zone to free pages from
 494  *
 495  * Returns:
 496  *      Nothing.
 497  */
 498 static void
 499 zone_drain(uma_zone_t zone)
 500 {
 501         uma_slab_t slab;
 502         uma_slab_t n;
 503         u_int64_t extra;
 504         u_int8_t flags;
 505         u_int8_t *mem;
 506         int i;
 507
 508         /*
 509          * We don't want to take pages from staticly allocated zones at this
 510          * time
 511          */
 512         if (zone->uz_flags & UMA_ZFLAG_NOFREE || zone->uz_freef == NULL)
 513                 return;
 514
 515         ZONE_LOCK(zone);
 516
 517         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 518                 cache_drain(zone);
 519
 520         if (zone->uz_free < zone->uz_wssize)
 521                 goto finished;
 522 #ifdef UMA_DEBUG
 523         printf("%s working set size: %llu free items: %u\n",
 524             zone->uz_name, (unsigned long long)zone->uz_wssize, zone->uz_free);
 525 #endif
 526         extra = zone->uz_wssize - zone->uz_free;
 527         extra /= zone->uz_ipers;
 528
 529         /* extra is now the number of extra slabs that we can free */
 530
 531         if (extra == 0)
 532                 goto finished;
 533
 534         slab = LIST_FIRST(&zone->uz_free_slab);
 535         while (slab && extra) {
 536                 n = LIST_NEXT(slab, us_link);
 537
 538                 /* We have no where to free these to */
 539                 if (slab->us_flags & UMA_SLAB_BOOT) {
 540                         slab = n;
 541                         continue;
 542                 }
 543
 544                 LIST_REMOVE(slab, us_link);
 545                 zone->uz_pages -= zone->uz_ppera;
 546                 zone->uz_free -= zone->uz_ipers;
 547                 if (zone->uz_fini)
 548                         for (i = 0; i < zone->uz_ipers; i++)
 549                                 zone->uz_fini(
 550                                     slab->us_data + (zone->uz_rsize * i),
 551                                     zone->uz_size);
 552                 flags = slab->us_flags;
 553                 mem = slab->us_data;
 554                 if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) {
 555                         if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
 556                                 UMA_HASH_REMOVE(mallochash,
 557                                     slab, slab->us_data);
 558                         } else {
 559                                 UMA_HASH_REMOVE(&zone->uz_hash,
 560                                     slab, slab->us_data);
 561                         }
 562                         uma_zfree_internal(slabzone, slab, NULL, 0);
 563                 } else if (zone->uz_flags & UMA_ZFLAG_MALLOC)
 564                         UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
 565 #ifdef UMA_DEBUG
 566                 printf("%s: Returning %d bytes.\n",
 567                     zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
 568 #endif
 569                 zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
 570
 571                 slab = n;
 572                 extra--;
 573         }
 574
 575 finished:
 576         ZONE_UNLOCK(zone);
 577 }
 578
 579 /*
 580  * Allocate a new slab for a zone.  This does not insert the slab onto a list.
 581  *
 582  * Arguments:
 583  *      zone  The zone to allocate slabs for
 584  *      wait  Shall we wait?
 585  *
 586  * Returns:
 587  *      The slab that was allocated or NULL if there is no memory and the
 588  *      caller specified M_NOWAIT.
 589  *
 590  */
 591 static uma_slab_t
 592 slab_zalloc(uma_zone_t zone, int wait)
 593 {
 594         uma_slab_t slab;        /* Starting slab */
 595         u_int8_t *mem;
 596         u_int8_t flags;
 597         int i;
 598
 599 #ifdef UMA_DEBUG
 600         printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
 601 #endif
 602
 603         if (booted || (zone->uz_flags & UMA_ZFLAG_PRIVALLOC)) {
 604                 ZONE_UNLOCK(zone);
 605                 mtx_lock(&Giant);
 606                 slab = (uma_slab_t )zone->uz_allocf(zone,
 607                     zone->uz_ppera * UMA_SLAB_SIZE, &flags, wait);
 608                 mtx_unlock(&Giant);
 609                 ZONE_LOCK(zone);
 610                 if (slab != NULL)
 611                         slab->us_data = (u_int8_t *)slab;
 612                 else
 613                         return (NULL);
 614         } else {
 615
 616                 if (zone->uz_ppera > 1)
 617                         panic("UMA: Attemping to allocate multiple pages before vm has started.\n");
 618                 if (zone->uz_flags & UMA_ZFLAG_MALLOC)
 619                         panic("Mallocing before uma_startup2 has been called.\n");
 620                 if (uma_boot_free == 0)
 621                         panic("UMA: Ran out of pre init pages, increase UMA_BOOT_PAGES\n");
 622                 slab = LIST_FIRST(&uma_boot_pages);
 623                 LIST_REMOVE(slab, us_link);
 624                 uma_boot_free--;
 625         }
 626
 627         mem = slab->us_data;
 628
 629         /* Alloc slab structure for offpage, otherwise adjust it's position */
 630         if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
 631                 slab = (uma_slab_t )(mem + zone->uz_pgoff);
 632         } else  {
 633                 slab = uma_zalloc_internal(slabzone, NULL, wait, NULL, -1);
 634                 if (slab == NULL)       /* XXX This should go away */
 635                         panic("UMA: No free slab structures");
 636                 if (!(zone->uz_flags & UMA_ZFLAG_MALLOC))
 637                         UMA_HASH_INSERT(&zone->uz_hash, slab, mem);
 638         }
 639         if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
 640 #ifdef UMA_DEBUG
 641                 printf("Inserting %p into malloc hash from slab %p\n",
 642                     mem, slab);
 643 #endif
 644                 UMA_HASH_INSERT(mallochash, slab, mem);
 645         }
 646
 647         slab->us_zone = zone;
 648         slab->us_data = mem;
 649
 650         /*
 651          * This is intended to spread data out across cache lines.
 652          *
 653          * This code doesn't seem to work properly on x86, and on alpha
 654          * it makes absolutely no performance difference. I'm sure it could
 655          * use some tuning, but sun makes outrageous claims about it's
 656          * performance.
 657          */
 658 #if 0
 659         if (zone->uz_cachemax) {
 660                 slab->us_data += zone->uz_cacheoff;
 661                 zone->uz_cacheoff += UMA_CACHE_INC;
 662                 if (zone->uz_cacheoff > zone->uz_cachemax)
 663                         zone->uz_cacheoff = 0;
 664         }
 665 #endif
 666
 667         slab->us_freecount = zone->uz_ipers;
 668         slab->us_firstfree = 0;
 669         slab->us_flags = flags;
 670         for (i = 0; i < zone->uz_ipers; i++)
 671                 slab->us_freelist[i] = i+1;
 672
 673         if (zone->uz_init)
 674                 for (i = 0; i < zone->uz_ipers; i++)
 675                         zone->uz_init(slab->us_data + (zone->uz_rsize * i),
 676                             zone->uz_size);
 677
 678         zone->uz_pages += zone->uz_ppera;
 679         zone->uz_free += zone->uz_ipers;
 680
 681         return (slab);
 682 }
 683
 684 /*
 685  * Allocates a number of pages from the system
 686  *
 687  * Arguments:
 688  *      zone  Unused
 689  *      bytes  The number of bytes requested
 690  *      wait  Shall we wait?
 691  *
 692  * Returns:
 693  *      A pointer to the alloced memory or possibly
 694  *      NULL if M_NOWAIT is set.
 695  */
 696 static void *
 697 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 698 {
 699         void *p;        /* Returned page */
 700
 701         /*
 702          * XXX The original zone allocator did this, but I don't think it's
 703          * neccisary in current.
 704          */
 705
 706         if (lockstatus(&kernel_map->lock, NULL)) {
 707                 *pflag = UMA_SLAB_KMEM;
 708                 p = (void *) kmem_malloc(kmem_map, bytes, wait);
 709         } else {
 710                 *pflag = UMA_SLAB_KMAP;
 711                 p = (void *) kmem_alloc(kernel_map, bytes);
 712         }
 713
 714         return (p);
 715 }
 716
 717 /*
 718  * Allocates a number of pages from within an object
 719  *
 720  * Arguments:
 721  *      zone   Unused
 722  *      bytes  The number of bytes requested
 723  *      wait   Shall we wait?
 724  *
 725  * Returns:
 726  *      A pointer to the alloced memory or possibly
 727  *      NULL if M_NOWAIT is set.
 728  */
 729 static void *
 730 obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 731 {
 732         vm_offset_t zkva;
 733         vm_offset_t retkva;
 734         vm_page_t p;
 735         int pages;
 736
 737
 738         if (zone->uz_pages + zone->uz_ppera > zone->uz_maxpages)
 739                 return (NULL);
 740
 741         retkva = NULL;
 742         pages = zone->uz_pages;
 743
 744         /*
 745          * This looks a little weird since we're getting one page at a time
 746          */
 747         while (bytes > 0) {
 748                 p = vm_page_alloc(zone->uz_obj, pages,
 749                     VM_ALLOC_INTERRUPT);
 750                 if (p == NULL)
 751                         return (NULL);
 752
 753                 zkva = zone->uz_kva + pages * PAGE_SIZE;
 754                 if (retkva == NULL)
 755                         retkva = zkva;
 756                 pmap_qenter(zkva, &p, 1);
 757                 bytes -= PAGE_SIZE;
 758                 pages += 1;
 759         }
 760
 761         *flags = UMA_SLAB_PRIV;
 762
 763         return ((void *)retkva);
 764 }
 765
 766 /*
 767  * Frees a number of pages to the system
 768  *
 769  * Arguments:
 770  *      mem   A pointer to the memory to be freed
 771  *      size  The size of the memory being freed
 772  *      flags The original p->us_flags field
 773  *
 774  * Returns:
 775  *      Nothing
 776  *
 777  */
 778 static void
 779 page_free(void *mem, int size, u_int8_t flags)
 780 {
 781         vm_map_t map;
 782         if (flags & UMA_SLAB_KMEM)
 783                 map = kmem_map;
 784         else if (flags & UMA_SLAB_KMAP)
 785                 map = kernel_map;
 786         else
 787                 panic("UMA: page_free used with invalid flags %d\n", flags);
 788
 789         kmem_free(map, (vm_offset_t)mem, size);
 790 }
 791
 792 /*
 793  * Zero fill initializer
 794  *
 795  * Arguments/Returns follow uma_init specifications
 796  *
 797  */
 798 static void
 799 zero_init(void *mem, int size)
 800 {
 801         bzero(mem, size);
 802 }
 803
 804 /*
 805  * Finish creating a small uma zone.  This calculates ipers, and the zone size.
 806  *
 807  * Arguments
 808  *      zone  The zone we should initialize
 809  *
 810  * Returns
 811  *      Nothing
 812  */
 813 static void
 814 zone_small_init(uma_zone_t zone)
 815 {
 816         int rsize;
 817         int memused;
 818         int ipers;
 819
 820         rsize = zone->uz_size;
 821
 822         if (rsize < UMA_SMALLEST_UNIT)
 823                 rsize = UMA_SMALLEST_UNIT;
 824
 825         if (rsize & zone->uz_align)
 826                 rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);
 827
 828         zone->uz_rsize = rsize;
 829
 830         rsize += 1;     /* Account for the byte of linkage */
 831         zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
 832         zone->uz_ppera = 1;
 833
 834         memused = zone->uz_ipers * zone->uz_rsize;
 835
 836         /* Can we do any better? */
 837         if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
 838                 if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
 839                         return;
 840                 ipers = UMA_SLAB_SIZE / zone->uz_rsize;
 841                 if (ipers > zone->uz_ipers) {
 842                         zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
 843                         zone->uz_ipers = ipers;
 844                 }
 845         }
 846
 847 }
 848
 849 /*
 850  * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
 851  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
 852  * more complicated.
 853  *
 854  * Arguments
 855  *      zone  The zone we should initialize
 856  *
 857  * Returns
 858  *      Nothing
 859  */
 860 static void
 861 zone_large_init(uma_zone_t zone)
 862 {
 863         int pages;
 864
 865         pages = zone->uz_size / UMA_SLAB_SIZE;
 866
 867         /* Account for remainder */
 868         if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
 869                 pages++;
 870
 871         zone->uz_ppera = pages;
 872         zone->uz_ipers = 1;
 873
 874         zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
 875         zone->uz_rsize = zone->uz_size;
 876 }
 877
 878 /*
 879  * Zone header ctor.  This initializes all fields, locks, etc.  And inserts
 880  * the zone onto the global zone list.
 881  *
 882  * Arguments/Returns follow uma_ctor specifications
 883  *      udata  Actually uma_zcreat_args
 884  *
 885  */
 886
static void
zone_ctor(void *mem, int size, void *udata)
{
        /*
         * Builds a zone header in 'mem' ('size' bytes) from the creation
         * arguments passed in 'udata', computes its slab layout, and links
         * it onto the global uma_zones list.
         */
        struct uma_zctor_args *arg = udata;
        uma_zone_t zone = mem;
        int cplen;
        int cpu;

        /* Start from a zeroed header, then copy the caller's parameters. */
        bzero(zone, size);
        zone->uz_name = arg->name;
        zone->uz_size = arg->size;
        zone->uz_ctor = arg->ctor;
        zone->uz_dtor = arg->dtor;
        zone->uz_init = arg->uminit;
        zone->uz_align = arg->align;
        zone->uz_free = 0;
        zone->uz_pages = 0;
        zone->uz_flags = 0;
        /* Default backend; the uma_zone_set_*() routines may replace these. */
        zone->uz_allocf = page_alloc;
        zone->uz_freef = page_free;
        /* NOTE(review): arg->fini is not copied into the zone here. */

        /* UMA_ZONE_ZINIT overrides any caller supplied init with zero_init. */
        if (arg->flags & UMA_ZONE_ZINIT)
                zone->uz_init = zero_init;

        /* Translate the public UMA_ZONE_* flags into internal UMA_ZFLAG_*. */
        if (arg->flags & UMA_ZONE_INTERNAL)
                zone->uz_flags |= UMA_ZFLAG_INTERNAL;

        if (arg->flags & UMA_ZONE_MALLOC)
                zone->uz_flags |= UMA_ZFLAG_MALLOC;

        if (arg->flags & UMA_ZONE_NOFREE)
                zone->uz_flags |= UMA_ZFLAG_NOFREE;

        /*
         * Pick the slab layout.  Items bigger than a slab take the large
         * path, which sets UMA_ZFLAG_OFFPAGE; this must happen before the
         * OFFPAGE test below.
         */
        if (zone->uz_size > UMA_SLAB_SIZE)
                zone_large_init(zone);
        else
                zone_small_init(zone);

        /*
         * We do this so that the per cpu lock name is unique for each zone.
         * The name is "PCPU " followed by as much of uz_name (including its
         * terminator) as fits; the last byte is always forced to NUL.
         */
        memcpy(zone->uz_lname, "PCPU ", 5);
        cplen = min(strlen(zone->uz_name) + 1, LOCKNAME_LEN - 6);
        memcpy(zone->uz_lname+5, zone->uz_name, cplen);
        zone->uz_lname[LOCKNAME_LEN - 1] = '\0';

        /*
         * If we're putting the slab header in the actual page we need to
         * figure out where in each page it goes.  This calculates a right
         * justified offset into the memory on an ALIGN_PTR boundary.
         */
        if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
                int totsize;
                int waste;

                /* Size of the slab struct and free list */
                totsize = sizeof(struct uma_slab) + zone->uz_ipers;
                /* Round up to the next multiple of (UMA_ALIGN_PTR + 1). */
                if (totsize & UMA_ALIGN_PTR)
                        totsize = (totsize & ~UMA_ALIGN_PTR) +
                            (UMA_ALIGN_PTR + 1);
                zone->uz_pgoff = UMA_SLAB_SIZE - totsize;

                /* Bytes left between the last item and the slab header. */
                waste = zone->uz_pgoff;
                waste -= (zone->uz_ipers * zone->uz_rsize);

                /*
                 * This calculates how much space we have for cache line size
                 * optimizations.  It works by offseting each slab slightly.
                 * Currently it breaks on x86, and so it is disabled.
                 *
                 * NOTE(review): uz_cachemax is still assigned below when the
                 * condition holds; where the feature is actually disabled is
                 * not visible in this function -- confirm.
                 */

                if (zone->uz_align < UMA_CACHE_INC && waste > UMA_CACHE_INC) {
                        zone->uz_cachemax = waste - UMA_CACHE_INC;
                        zone->uz_cacheoff = 0;
                }

                /* Sanity check: header placed at uz_pgoff must end in-slab. */
                totsize = zone->uz_pgoff + sizeof(struct uma_slab)
                    + zone->uz_ipers;
                /* I don't think it's possible, but I'll make sure anyway */
                if (totsize > UMA_SLAB_SIZE) {
                        printf("zone %s ipers %d rsize %d size %d\n",
                            zone->uz_name, zone->uz_ipers, zone->uz_rsize,
                            zone->uz_size);
                        panic("UMA slab won't fit.\n");
                }
        } else {
                /* hash_expand here to allocate the initial hash table */
                hash_expand(&zone->uz_hash);
                zone->uz_pgoff = 0;
        }

#ifdef UMA_DEBUG
        printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
            zone->uz_name, zone,
            zone->uz_size, zone->uz_ipers,
            zone->uz_ppera, zone->uz_pgoff);
#endif
        ZONE_LOCK_INIT(zone);

        /* Publish the zone on the global list, protected by uma_mtx. */
        mtx_lock(&uma_mtx);
        LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
        mtx_unlock(&uma_mtx);

        /*
         * Some internal zones don't have room allocated for the per cpu
         * caches.  If we're internal, bail out here.
         */

        if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
                return;

        /*
         * Per cpu setup: uc_count is the highest bucket index that will be
         * filled, capped by both the items per slab and the bucket size.
         */
        for (cpu = 0; cpu < maxcpu; cpu++) {
                if (zone->uz_ipers < UMA_BUCKET_SIZE)
                        zone->uz_cpu[cpu].uc_count = zone->uz_ipers - 1;
                else
                        zone->uz_cpu[cpu].uc_count = UMA_BUCKET_SIZE - 1;
                CPU_LOCK_INIT(zone, cpu);
        }
}
1004
1005 /*
1006  * Traverses every zone in the system and calls a callback
1007  *
1008  * Arguments:
1009  *      zfunc  A pointer to a function which accepts a zone
1010  *              as an argument.
1011  *
1012  * Returns:
1013  *      Nothing
1014  */
1015 static void
1016 zone_foreach(void (*zfunc)(uma_zone_t))
1017 {
1018         uma_zone_t zone;
1019
1020         mtx_lock(&uma_mtx);
1021         LIST_FOREACH(zone, &uma_zones, uz_link) {
1022                 zfunc(zone);
1023         }
1024         mtx_unlock(&uma_mtx);
1025 }
1026
1027 /* Public functions */
1028 /* See uma.h */
1029 void
1030 uma_startup(void *bootmem)
1031 {
1032         struct uma_zctor_args args;
1033         uma_slab_t slab;
1034         int slabsize;
1035         int i;
1036
1037 #ifdef UMA_DEBUG
1038         printf("Creating uma zone headers zone.\n");
1039 #endif
1040 #ifdef SMP
1041         maxcpu = mp_maxid + 1;
1042 #else
1043         maxcpu = 1;
1044 #endif
1045 #ifdef UMA_DEBUG
1046         printf("Max cpu = %d, mp_maxid = %d\n", maxcpu, mp_maxid);
1047         Debugger("stop");
1048 #endif
1049         mtx_init(&uma_mtx, "UMA lock", MTX_DEF);
1050         /* "manually" Create the initial zone */
1051         args.name = "UMA Zones";
1052         args.size = sizeof(struct uma_zone) +
1053             (sizeof(struct uma_cache) * (maxcpu - 1));
1054         args.ctor = zone_ctor;
1055         args.dtor = NULL;
1056         args.uminit = zero_init;
1057         args.fini = NULL;
1058         args.align = 32 - 1;
1059         args.flags = UMA_ZONE_INTERNAL;
1060         /* The initial zone has no Per cpu queues so it's smaller */
1061         zone_ctor(zones, sizeof(struct uma_zone), &args);
1062
1063 #ifdef UMA_DEBUG
1064         printf("Filling boot free list.\n");
1065 #endif
1066         for (i = 0; i < UMA_BOOT_PAGES; i++) {
1067                 slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1068                 slab->us_data = (u_int8_t *)slab;
1069                 slab->us_flags = UMA_SLAB_BOOT;
1070                 LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1071                 uma_boot_free++;
1072         }
1073
1074 #ifdef UMA_DEBUG
1075         printf("Creating slab zone.\n");
1076 #endif
1077
1078         /*
1079          * This is the max number of free list items we'll have with
1080          * offpage slabs.
1081          */
1082
1083         slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab);
1084         slabsize /= UMA_MAX_WASTE;
1085         slabsize++;                     /* In case there it's rounded */
1086         slabsize += sizeof(struct uma_slab);
1087
1088         /* Now make a zone for slab headers */
1089         slabzone = uma_zcreate("UMA Slabs",
1090                                 slabsize,
1091                                 NULL, NULL, NULL, NULL,
1092                                 UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1093
1094         hashzone = uma_zcreate("UMA Hash",
1095             sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1096             NULL, NULL, NULL, NULL,
1097             UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1098
1099         bucketzone = uma_zcreate("UMA Buckets", sizeof(struct uma_bucket),
1100             NULL, NULL, NULL, NULL,
1101             UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1102
1103
1104 #ifdef UMA_DEBUG
1105         printf("UMA startup complete.\n");
1106 #endif
1107 }
1108
1109 /* see uma.h */
1110 void
1111 uma_startup2(void *hashmem, u_long elems)
1112 {
1113         bzero(hashmem, elems * sizeof(void *));
1114         mallochash->uh_slab_hash = hashmem;
1115         mallochash->uh_hashsize = elems;
1116         mallochash->uh_hashmask = elems - 1;
1117         booted = 1;
1118 #ifdef UMA_DEBUG
1119         printf("UMA startup2 complete.\n");
1120 #endif
1121 }
1122
1123 /*
1124  * Initialize our callout handle
1125  *
1126  */
1127
1128 static void
1129 uma_startup3(void)
1130 {
1131 #ifdef UMA_DEBUG
1132         printf("Starting callout.\n");
1133 #endif
1134         /* We'll be mpsafe once the vm is locked. */
1135         callout_init(&uma_callout, 0);
1136         callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
1137 #ifdef UMA_DEBUG
1138         printf("UMA startup3 complete.\n");
1139 #endif
1140 }
1141
1142 /* See uma.h */
1143 uma_zone_t
1144 uma_zcreate(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init uminit,
1145                      uma_fini fini, int align, u_int16_t flags)
1146
1147 {
1148         struct uma_zctor_args args;
1149
1150         /* This stuff is essential for the zone ctor */
1151         args.name = name;
1152         args.size = size;
1153         args.ctor = ctor;
1154         args.dtor = dtor;
1155         args.uminit = uminit;
1156         args.fini = fini;
1157         args.align = align;
1158         args.flags = flags;
1159
1160         return (uma_zalloc_internal(zones, &args, M_WAITOK, NULL, -1));
1161 }
1162
1163 /* See uma.h */
1164 void *
1165 uma_zalloc_arg(uma_zone_t zone, void *udata, int wait)
1166 {
1167         void *item;
1168         uma_cache_t cache;
1169         uma_bucket_t bucket;
1170         int isitem;
1171         int cpu;
1172
1173         /* This is the fast path allocation */
1174 #ifdef UMA_DEBUG_ALLOC_1
1175         printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
1176 #endif
1177         cpu = PCPU_GET(cpuid);
1178         CPU_LOCK(zone, cpu);
1179         cache = &zone->uz_cpu[cpu];
1180         cache->uc_allocs++;
1181
1182 zalloc_start:
1183         bucket = cache->uc_allocbucket;
1184
1185         if (bucket) {
1186                 if (bucket->ub_ptr > -1) {
1187                         item = bucket->ub_bucket[bucket->ub_ptr];
1188 #ifdef INVARIANTS
1189                         bucket->ub_bucket[bucket->ub_ptr] = NULL;
1190 #endif
1191                         bucket->ub_ptr--;
1192                         KASSERT(item != NULL,
1193                             ("uma_zalloc: Bucket pointer mangled."));
1194                         cache->uc_allocs++;
1195                         CPU_UNLOCK(zone, cpu);
1196                         if (zone->uz_ctor)
1197                                 zone->uz_ctor(item, zone->uz_size, udata);
1198                         return (item);
1199                 } else if (cache->uc_freebucket) {
1200                         /*
1201                          * We have run out of items in our allocbucket.
1202                          * See if we can switch with our free bucket.
1203                          */
1204                         if (cache->uc_freebucket->ub_ptr > -1) {
1205                                 uma_bucket_t swap;
1206
1207 #ifdef UMA_DEBUG_ALLOC
1208                                 printf("uma_zalloc: Swapping empty with alloc.\n");
1209 #endif
1210                                 swap = cache->uc_freebucket;
1211                                 cache->uc_freebucket = cache->uc_allocbucket;
1212                                 cache->uc_allocbucket = swap;
1213
1214                                 goto zalloc_start;
1215                         }
1216                 }
1217         }
1218         /*
1219          * We can get here for three reasons:
1220          *
1221          * 1) The buckets are NULL
1222          * 2) The zone is INTERNAL, and so it has no buckets.
1223          * 3) The alloc and free buckets are both empty.
1224          *
1225          * Just handoff to uma_zalloc_internal to do the hard stuff
1226          *
1227          */
1228 #ifdef UMA_DEBUG_ALLOC
1229         printf("uma_zalloc: Falling back to zalloc_internal.\n");
1230 #endif
1231
1232         item = uma_zalloc_internal(zone, udata, wait, &isitem, cpu);
1233
1234 #ifdef UMA_DEBUG
1235         printf("uma_zalloc: zalloc_internal completed.\n");
1236 #endif
1237
1238         if (item && isitem == 0)
1239                 goto zalloc_start;
1240
1241         /*
1242          * If isitem is set then we should just return it. The cpu lock
1243          * was unlocked when we couldn't get a bucket.
1244          */
1245
1246 #ifdef INVARIANTS
1247         if (wait == M_WAITOK)
1248                 KASSERT(item != NULL,
1249                     ("uma_zalloc: WAITOK set but we're returning NULL"));
1250 #endif
1251         return item;
1252 }
1253
1254 /*
1255  * Allocates an item for an internal zone OR fills a bucket
1256  *
1257  * Arguments
1258  *      zone   The zone to alloc for.
1259  *      udata  The data to be passed to the constructor.
1260  *      wait   M_WAITOK or M_NOWAIT.
1261  *      isitem The returned value is an item if this is true.
1262  *      cpu    The cpu # of the cache that we should use, or -1.
1263  *
1264  * Returns
1265  *      NULL if there is no memory and M_NOWAIT is set
 *      An item if called on an internal zone
1267  *      Non NULL if called to fill a bucket and it was successful.
1268  *
1269  * Discussion:
1270  *      This was much cleaner before it had to do per cpu caches.  It is
1271  *      complicated now because it has to handle the simple internal case, and
1272  *      the more involved bucket filling and allocation.  The isitem is there
1273  *      to remove a failure case.  You shouldn't fail on allocating from a zone
1274  *      because there were no buckets.  This allows the exported zalloc to just
1275  *      return the item.
1276  *
1277  */
1278
1279 static void *
1280 uma_zalloc_internal(uma_zone_t zone, void *udata, int wait, int *isitem, int cpu)
1281 {
1282         uma_bucket_t bucket;
1283         uma_cache_t cache;
1284         uma_slab_t slab;
1285         u_int8_t freei;
1286         void *item;
1287
1288         bucket = NULL;
1289         cache = NULL;
1290         item = NULL;
1291
1292         /*
1293          * This is to stop us from allocating per cpu buckets while we're running
1294          * out of UMA_BOOT_PAGES.  Otherwise, we would exhaust the boot pages.
1295          */
1296
1297         if (!booted && zone == bucketzone)
1298                 return (NULL);
1299
1300 #ifdef UMA_DEBUG_ALLOC
1301         printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
1302 #endif
1303         if (isitem != NULL)
1304                 *isitem = 0;
1305
1306         ZONE_LOCK(zone);
1307
1308         /* We got here because we need to fill some buckets */
1309         if (cpu != -1) {
1310                 cache = &zone->uz_cpu[cpu];
1311
1312                 zone->uz_allocs += cache->uc_allocs;
1313                 /* Check the free list */
1314                 bucket = LIST_FIRST(&zone->uz_full_bucket);
1315                 if (bucket) {
1316                         LIST_REMOVE(bucket, ub_link);
1317                         /* Our old one is now a free bucket */
1318                         if (cache->uc_allocbucket) {
1319                                 KASSERT(cache->uc_allocbucket->ub_ptr == -1,
1320                                     ("uma_zalloc_internal: Freeing a non free bucket."));
1321                                 LIST_INSERT_HEAD(&zone->uz_free_bucket,
1322                                     cache->uc_allocbucket, ub_link);
1323                         }
1324                         KASSERT(bucket->ub_ptr != -1,
1325                             ("uma_zalloc_internal: Returning an empty bucket."));
1326                         /*zone->uz_free -= bucket->ub_ptr + 1;*/
1327                         cache->uc_allocbucket = bucket;
1328                         ZONE_UNLOCK(zone);
1329                         return (bucket);
1330                 }
1331                 /* Bump up our uc_count so we get here less */
1332                 if (cache->uc_count < UMA_BUCKET_SIZE - 1)
1333                         cache->uc_count++;
1334                 /* Nothing on the free list, try to re-use the old one */
1335                 bucket = cache->uc_allocbucket;
1336                 if (bucket == NULL) {
1337                         /* Nope, we need a new one */
1338                         CPU_UNLOCK(zone, cpu);
1339                         ZONE_UNLOCK(zone);
1340                         bucket = uma_zalloc_internal(bucketzone,
1341                             NULL, wait, NULL, -1);
1342                         CPU_LOCK(zone, cpu);
1343                         ZONE_LOCK(zone);
1344                         /* Did we lose the race? */
1345                         if (cache->uc_allocbucket) {
1346 #ifdef UMA_DEBUG
1347                                 printf("uma_zalloc_internal: Lost race with another CPU.\n");
1348 #endif
1349                                 if (bucket)
1350                                         uma_zfree_internal(bucketzone,
1351                                             bucket, NULL, 0);
1352                                 ZONE_UNLOCK(zone);
1353                                 return (cache->uc_allocbucket);
1354                         }
1355                         cache->uc_allocbucket = bucket;
1356
1357                         if (bucket) {
1358 #ifdef INVARIANTS
1359                                 bzero(bucket, bucketzone->uz_size);
1360 #endif
1361                                 bucket->ub_ptr = -1;
1362                         } else {
1363                                 /*
1364                                  * We may not get a bucket if we recurse, so
1365                                  * return an actual item. The rest of this code
1366                                  * does the right thing if the cache is NULL.
1367                                  */
1368 #ifdef UMA_DEBUG
1369                                 printf("uma_zalloc_internal: Bucketzone returned NULL\n");
1370 #endif
1371                                 CPU_UNLOCK(zone, cpu);
1372                                 cache = NULL;
1373                                 cpu = -1;
1374                         }
1375                 }
1376         }
1377
1378 new_slab:
1379
1380         /* Find a slab with some space */
1381         if (zone->uz_free) {
1382                 if (!LIST_EMPTY(&zone->uz_part_slab)) {
1383                         slab = LIST_FIRST(&zone->uz_part_slab);
1384                 } else {
1385                         slab = LIST_FIRST(&zone->uz_free_slab);
1386                         LIST_REMOVE(slab, us_link);
1387                         LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1388                 }
1389         } else {
1390                 /*
1391                  * This is to prevent us from recursively trying to allocate
1392                  * buckets.  The problem is that if an allocation forces us to
1393                  * grab a new bucket we will call page_alloc, which will go off
1394                  * and cause the vm to allocate vm_map_entries.  If we need new
1395                  * buckets there too we will recurse in kmem_alloc and bad
1396                  * things happen.  So instead we return a NULL bucket, and make
1397                  * the code that allocates buckets smart enough to deal with it                  */
1398                 if (zone == bucketzone && zone->uz_recurse != 0) {
1399                         ZONE_UNLOCK(zone);
1400                         return (NULL);
1401                 }
1402                 zone->uz_recurse++;
1403                 slab = slab_zalloc(zone, wait);
1404                 zone->uz_recurse--;
1405                 if (slab)  {
1406                         LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1407                 /*
1408                  * We might not have been able to get a page, but another cpu
1409                  * could have while we were unlocked.
1410                  */
1411                 } else if (zone->uz_free == 0) {
1412                         ZONE_UNLOCK(zone);
1413                         /* If we're filling a bucket return what we have */
1414                         if (bucket != NULL && bucket->ub_ptr != -1) {
1415                                 return (bucket);
1416                         } else
1417                                 return (NULL);
1418                 } else {
1419                         /* Another cpu must have succeeded */
1420                         if ((slab = LIST_FIRST(&zone->uz_part_slab)) == NULL) {
1421                                 slab = LIST_FIRST(&zone->uz_free_slab);
1422                                 LIST_REMOVE(slab, us_link);
1423                                 LIST_INSERT_HEAD(&zone->uz_part_slab,
1424                                     slab, us_link);
1425                         }
1426                 }
1427         }
1428
1429         while (slab->us_freecount) {
1430                 freei = slab->us_firstfree;
1431                 slab->us_firstfree = slab->us_freelist[freei];
1432 #ifdef INVARIANTS
1433                 slab->us_freelist[freei] = 255;
1434 #endif
1435                 slab->us_freecount--;
1436                 zone->uz_free--;
1437                 item = slab->us_data + (zone->uz_rsize * freei);
1438
1439                 if (cache == NULL) {
1440                         zone->uz_allocs++;
1441                         break;
1442                 }
1443
1444                 bucket->ub_bucket[++bucket->ub_ptr] = item;
1445
1446                 /* Don't overfill the bucket! */
1447                 if (bucket->ub_ptr == cache->uc_count)
1448                         break;
1449         }
1450
1451         /* Move this slab to the full list */
1452         if (slab->us_freecount == 0) {
1453                 LIST_REMOVE(slab, us_link);
1454                 LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
1455         }
1456
1457         if (cache != NULL) {
1458                 /* Try to keep the buckets totally full, but don't block */
1459                 if (bucket->ub_ptr < cache->uc_count) {
1460                         wait = M_NOWAIT;
1461                         goto new_slab;
1462                 }
1463         }
1464
1465         ZONE_UNLOCK(zone);
1466
1467         /* Only construct at this time if we're not filling a bucket */
1468         if (cache == NULL)  {
1469                 if (zone->uz_ctor)
1470                         zone->uz_ctor(item, zone->uz_size, udata);
1471
1472                 if (isitem != NULL)
1473                         *isitem = 1;
1474         }
1475
1476         return (item);
1477 }
1478
1479 /* See uma.h */
1480 void
1481 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
1482 {
1483         uma_cache_t cache;
1484         uma_bucket_t bucket;
1485         int cpu;
1486
1487         /* This is the fast path free */
1488 #ifdef UMA_DEBUG_ALLOC_1
1489         printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
1490 #endif
1491         cpu = PCPU_GET(cpuid);
1492         CPU_LOCK(zone, cpu);
1493         cache = &zone->uz_cpu[cpu];
1494
1495 zfree_start:
1496         bucket = cache->uc_freebucket;
1497
1498         if (bucket) {
1499                 /* Do we have room in our bucket? */
1500                 if (bucket->ub_ptr < cache->uc_count) {
1501                         bucket->ub_ptr++;
1502                         KASSERT(bucket->ub_bucket[bucket->ub_ptr] == NULL,
1503                             ("uma_zfree: Freeing to non free bucket index."));
1504                         bucket->ub_bucket[bucket->ub_ptr] = item;
1505                         CPU_UNLOCK(zone, cpu);
1506                         if (zone->uz_dtor)
1507                                 zone->uz_dtor(item, zone->uz_size, udata);
1508                         return;
1509                 } else if (cache->uc_allocbucket) {
1510 #ifdef UMA_DEBUG_ALLOC
1511                         printf("uma_zfree: Swapping buckets.\n");
1512 #endif
1513                         /*
1514                          * We have run out of space in our freebucket.
1515                          * See if we can switch with our alloc bucket.
1516                          */
1517                         if (cache->uc_allocbucket->ub_ptr <
1518                             cache->uc_freebucket->ub_ptr) {
1519                                 uma_bucket_t swap;
1520
1521                                 swap = cache->uc_freebucket;
1522                                 cache->uc_freebucket = cache->uc_allocbucket;
1523                                 cache->uc_allocbucket = swap;
1524
1525                                 goto zfree_start;
1526                         }
1527                 }
1528         }
1529
1530         /*
1531          * We can get here for three reasons:
1532          *
1533          * 1) The buckets are NULL
1534          * 2) The zone is INTERNAL, and so it has no buckets.
1535          * 3) The alloc and free buckets are both somewhat full.
1536          *
1537          */
1538
1539         ZONE_LOCK(zone);
1540
1541         if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
1542                 bucket = cache->uc_freebucket;
1543                 cache->uc_freebucket = NULL;
1544
1545                 /* Can we throw this on the zone full list? */
1546                 if (bucket != NULL) {
1547 #ifdef UMA_DEBUG_ALLOC
1548                         printf("uma_zfree: Putting old bucket on the free list.\n");
1549 #endif
1550                         /* ub_ptr is pointing to the last free item */
1551                         KASSERT(bucket->ub_ptr != -1,
1552                             ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
1553                         /*zone->uz_free += bucket->ub_ptr + 1;*/
1554                         LIST_INSERT_HEAD(&zone->uz_full_bucket,
1555                             bucket, ub_link);
1556                         bucket = LIST_FIRST(&zone->uz_free_bucket);
1557                         if (bucket)
1558                                 LIST_REMOVE(bucket, ub_link);
1559                 }
1560                 /*
1561                  * Do we need to alloc one? Either the freebucket was NULL
1562                  * or the free_bucket list was empty.
1563                  */
1564                 if (bucket == NULL) {
1565 #ifdef UMA_DEBUG_ALLOC
1566                         printf("uma_zfree: Allocating new free bucket.\n");
1567 #endif
1568                         /* This has to be done so we don't recurse on a lock */
1569                         ZONE_UNLOCK(zone);
1570                         CPU_UNLOCK(zone, cpu);
1571                         bucket = uma_zalloc_internal(bucketzone,
1572                             NULL, M_NOWAIT, NULL, -1);
1573                         CPU_LOCK(zone, cpu);
1574                         ZONE_LOCK(zone);
1575                         if (bucket) {
1576 #ifdef INVARIANTS
1577                                 bzero(bucket, bucketzone->uz_size);
1578 #endif
1579                                 bucket->ub_ptr = -1;
1580                         }
1581                         /* Did we lose the race? */
1582                         if (cache->uc_freebucket != NULL) {
1583                                 if (bucket)
1584                                         uma_zfree_internal(bucketzone,
1585                                             bucket, NULL, 0);
1586                                 ZONE_UNLOCK(zone);
1587                                 goto zfree_start;
1588                         }
1589                         /* If we couldn't get one just free directly */
1590                         if (bucket == NULL)
1591                                 goto zfree_internal;
1592                 }
1593                 cache->uc_freebucket = bucket;
1594                 ZONE_UNLOCK(zone);
1595                 goto zfree_start;
1596         }
1597
1598 zfree_internal:
1599
1600         CPU_UNLOCK(zone, cpu);
1601         ZONE_UNLOCK(zone);
1602         uma_zfree_internal(zone, item, udata, 0);
1603
1604         return;
1605
1606 }
1607
1608 /*
1609  * Frees an item to an INTERNAL zone or allocates a free bucket
1610  *
1611  * Arguments:
1612  *      zone   The zone to free to
1613  *      item   The item we're freeing
1614  *      udata  User supplied data for the dtor
1615  *      skip   Skip the dtor, it was done in uma_zfree_arg
1616  */
1617
1618 static void
1619 uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
1620 {
1621         uma_slab_t slab;
1622         u_int8_t *mem;
1623         u_int8_t freei;
1624
1625         ZONE_LOCK(zone);
1626
1627         if (!(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
1628                 mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
1629                 if (zone->uz_flags & UMA_ZFLAG_OFFPAGE)
1630                         slab = hash_sfind(&zone->uz_hash, mem);
1631                 else {
1632                         mem += zone->uz_pgoff;
1633                         slab = (uma_slab_t)mem;
1634                 }
1635         } else {
1636                 slab = (uma_slab_t)udata;
1637         }
1638
1639         /* Do we need to remove from any lists? */
1640         if (slab->us_freecount+1 == zone->uz_ipers) {
1641                 LIST_REMOVE(slab, us_link);
1642                 LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
1643         } else if (slab->us_freecount == 0) {
1644                 LIST_REMOVE(slab, us_link);
1645                 LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1646         }
1647
1648         /* Slab management stuff */
1649         freei = ((unsigned long)item - (unsigned long)slab->us_data)
1650                 / zone->uz_rsize;
1651 #ifdef INVARIANTS
1652         if (((freei * zone->uz_rsize) + slab->us_data) != item)
1653                 panic("zone: %s(%p) slab %p freed address %p unaligned.\n",
1654                     zone->uz_name, zone, slab, item);
1655         if (freei >= zone->uz_ipers)
1656                 panic("zone: %s(%p) slab %p freelist %i out of range 0-%d\n",
1657                     zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
1658
1659         if (slab->us_freelist[freei] != 255) {
1660                 printf("Slab at %p, freei %d = %d.\n",
1661                     slab, freei, slab->us_freelist[freei]);
1662                 panic("Duplicate free of item %p from zone %p(%s)\n",
1663                     item, zone, zone->uz_name);
1664         }
1665 #endif
1666         slab->us_freelist[freei] = slab->us_firstfree;
1667         slab->us_firstfree = freei;
1668         slab->us_freecount++;
1669
1670         /* Zone statistics */
1671         zone->uz_free++;
1672
1673         ZONE_UNLOCK(zone);
1674
1675         if (!skip && zone->uz_dtor)
1676                 zone->uz_dtor(item, zone->uz_size, udata);
1677 }
1678
1679 /* See uma.h */
1680 void
1681 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
1682 {
1683         ZONE_LOCK(zone);
1684
1685         zone->uz_freef = freef;
1686
1687         ZONE_UNLOCK(zone);
1688 }
1689
1690 /* See uma.h */
1691 void
1692 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
1693 {
1694         ZONE_LOCK(zone);
1695
1696         zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
1697         zone->uz_allocf = allocf;
1698
1699         ZONE_UNLOCK(zone);
1700 }
1701
1702 /* See uma.h */
1703 int
1704 uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
1705 {
1706         int pages;
1707         vm_offset_t kva;
1708
1709         ZONE_LOCK(zone);
1710         mtx_lock(&Giant);
1711
1712         zone->uz_obj = obj;
1713         pages = count / zone->uz_ipers;
1714
1715         if (pages * zone->uz_ipers < count)
1716                 pages++;
1717         zone->uz_kva = NULL;
1718         ZONE_UNLOCK(zone);
1719         kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
1720         ZONE_LOCK(zone);
1721
1722         zone->uz_kva = kva;
1723
1724         if (zone->uz_kva == 0) {
1725                 ZONE_UNLOCK(zone);
1726                 return (0);
1727         }
1728
1729         zone->uz_maxpages = pages;
1730
1731         if (zone->uz_obj == NULL)
1732                 zone->uz_obj = vm_object_allocate(OBJT_DEFAULT,
1733                     zone->uz_maxpages);
1734         else
1735                 _vm_object_allocate(OBJT_DEFAULT,
1736                     zone->uz_maxpages, zone->uz_obj);
1737
1738         zone->uz_allocf = obj_alloc;
1739         zone->uz_flags |= UMA_ZFLAG_NOFREE | UMA_ZFLAG_PRIVALLOC;
1740
1741         mtx_unlock(&Giant);
1742         ZONE_UNLOCK(zone);
1743
1744         return (1);
1745 }
1746
1747 /* See uma.h */
1748 void
1749 uma_prealloc(uma_zone_t zone, int items)
1750 {
1751         int slabs;
1752         uma_slab_t slab;
1753
1754         ZONE_LOCK(zone);
1755         slabs = items / zone->uz_ipers;
1756         if (slabs * zone->uz_ipers < items)
1757                 slabs++;
1758
1759         while (slabs > 0) {
1760                 slab = slab_zalloc(zone, M_WAITOK);
1761                 LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
1762                 slabs--;
1763         }
1764         ZONE_UNLOCK(zone);
1765 }
1766
1767 /* See uma.h */
1768 void
1769 uma_reclaim(void)
1770 {
1771         /*
1772          * You might think that the delay below would improve performance since
1773          * the allocator will give away memory that it may ask for immediately.
1774          * Really, it makes things worse, since cpu cycles are so much cheaper
1775          * than disk activity.
1776          */
1777 #if 0
1778         static struct timeval tv = {0};
1779         struct timeval now;
1780         getmicrouptime(&now);
1781         if (now.tv_sec > tv.tv_sec + 30)
1782                 tv = now;
1783         else
1784                 return;
1785 #endif
1786 #ifdef UMA_DEBUG
1787         printf("UMA: vm asked us to release pages!\n");
1788 #endif
1789         zone_foreach(zone_drain);
1790
1791         /*
1792          * Some slabs may have been freed but this zone will be visited early
1793          * we visit again so that we can free pages that are empty once other
1794          * zones are drained.  We have to do the same for buckets.
1795          */
1796         zone_drain(slabzone);
1797         zone_drain(bucketzone);
1798 }
1799
1800 void *
1801 uma_large_malloc(int size, int wait)
1802 {
1803         void *mem;
1804         uma_slab_t slab;
1805         u_int8_t flags;
1806
1807         slab = uma_zalloc_internal(slabzone, NULL, wait, NULL, -1);
1808         if (slab == NULL)
1809                 return (NULL);
1810
1811         mem = page_alloc(NULL, size, &flags, wait);
1812         if (mem) {
1813                 slab->us_data = mem;
1814                 slab->us_flags = flags | UMA_SLAB_MALLOC;
1815                 slab->us_size = size;
1816                 UMA_HASH_INSERT(mallochash, slab, mem);
1817         } else {
1818                 uma_zfree_internal(slabzone, slab, NULL, 0);
1819         }
1820
1821
1822         return (mem);
1823 }
1824
1825 void
1826 uma_large_free(uma_slab_t slab)
1827 {
1828         UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
1829         page_free(slab->us_data, slab->us_size, slab->us_flags);
1830         uma_zfree_internal(slabzone, slab, NULL, 0);
1831 }
1832
1833 void
1834 uma_print_stats(void)
1835 {
1836         zone_foreach(uma_print_zone);
1837 }
1838
1839 void
1840 uma_print_zone(uma_zone_t zone)
1841 {
1842         printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
1843             zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
1844             zone->uz_ipers, zone->uz_ppera,
1845             (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
1846 }
1847
1848 /*
1849  * Sysctl handler for vm.zone
1850  *
1851  * stolen from vm_zone.c
1852  */
1853 static int
1854 sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
1855 {
1856         int error, len, cnt;
1857         const int linesize = 128;       /* conservative */
1858         int totalfree;
1859         char *tmpbuf, *offset;
1860         uma_zone_t z;
1861         char *p;
1862
1863         cnt = 0;
1864         LIST_FOREACH(z, &uma_zones, uz_link)
1865                 cnt++;
1866         MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
1867                         M_TEMP, M_WAITOK);
1868         len = snprintf(tmpbuf, linesize,
1869             "\nITEM            SIZE     LIMIT     USED    FREE  REQUESTS\n\n");
1870         if (cnt == 0)
1871                 tmpbuf[len - 1] = '\0';
1872         error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
1873         if (error || cnt == 0)
1874                 goto out;
1875         offset = tmpbuf;
1876         LIST_FOREACH(z, &uma_zones, uz_link) {
1877                 if (cnt == 0)   /* list may have changed size */
1878                         break;
1879                 ZONE_LOCK(z);
1880                 totalfree = z->uz_free + z->uz_cachefree;
1881                 len = snprintf(offset, linesize,
1882                     "%-12.12s  %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
1883                     z->uz_name, z->uz_size,
1884                     z->uz_maxpages * z->uz_ipers,
1885                     (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
1886                     totalfree,
1887                     (unsigned long long)z->uz_allocs);
1888                 ZONE_UNLOCK(z);
1889                 for (p = offset + 12; p > offset && *p == ' '; --p)
1890                         /* nothing */ ;
1891                 p[1] = ':';
1892                 cnt--;
1893                 offset += len;
1894         }
1895         *offset++ = '\0';
1896         error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
1897 out:
1898         FREE(tmpbuf, M_TEMP);
1899         return (error);
1900 }