sys/kern/subr_mbuf.c

   1 /*
   2  * Copyright (c) 2001
   3  *      Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. The name of the author may not be used to endorse or promote products
  14  *    derived from this software without specific prior written permission.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30
  31 #include "opt_param.h"
  32 #include <sys/param.h>
  33 #include <sys/systm.h>
  34 #include <sys/malloc.h>
  35 #include <sys/mbuf.h>
  36 #include <sys/lock.h>
  37 #include <sys/mutex.h>
  38 #include <sys/condvar.h>
  39 #include <sys/smp.h>
  40 #include <sys/kernel.h>
  41 #include <sys/sysctl.h>
  42 #include <sys/domain.h>
  43 #include <sys/protosw.h>
  44 #include <vm/vm.h>
  45 #include <vm/vm_kern.h>
  46 #include <vm/vm_extern.h>
  47
  48 /*
  49  * Maximum number of PCPU containers. If you know what you're doing you could
  50  * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
  51  * system during compilation, and thus prevent kernel structure bloat.
  52  *
  53  * SMP and non-SMP kernels clearly have a different number of possible cpus,
  54  * but because we cannot assume a dense array of CPUs, we always allocate
  55  * and traverse PCPU containers up to NCPU amount and merely check for
  56  * CPU availability.
  57  */
  58 #ifdef  MBALLOC_NCPU
  59 #define NCPU    MBALLOC_NCPU
  60 #else
  61 #define NCPU    MAXCPU
  62 #endif
  63
  64 /*
  65  * The mbuf allocator is heavily based on Alfred Perlstein's
  66  * (alfred@FreeBSD.org) "memcache" allocator which is itself based
  67  * on concepts from several per-CPU memory allocators. The difference
  68  * between this allocator and memcache is that, among other things:
  69  *
  70  * (i) We don't free back to the map from the free() routine - we leave the
  71  *     option of implementing lazy freeing (from a kproc) in the future.
  72  *
  73  * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
  74  *      maximum number of allocatable objects of a given type. Further,
  75  *      we handle blocking on a cv in the case that the map is starved and
  76  *      we have to rely solely on cached (circulating) objects.
  77  *
  78  * The mbuf allocator keeps all objects that it allocates in mb_buckets.
  79  * The buckets keep a page worth of objects (an object can be an mbuf or an
  80  * mbuf cluster) and facilitate moving larger sets of contiguous objects
  81  * from the per-CPU lists to the main list for the given object. The buckets
  82  * also have an added advantage in that after several moves from a per-CPU
  83  * list to the main list and back to the per-CPU list, contiguous objects
  84  * are kept together, thus trying to put the TLB cache to good use.
  85  *
  86  * The buckets are kept on singly-linked lists called "containers." A container
  87  * is protected by a mutex lock in order to ensure consistency. The mutex lock
   88  * itself is allocated separately and attached to the container at boot time,
  89  * thus allowing for certain containers to share the same mutex lock. Per-CPU
  90  * containers for mbufs and mbuf clusters all share the same per-CPU
  91  * lock whereas the "general system" containers (i.e. the "main lists") for
  92  * these objects share one global lock.
  93  *
  94  */
   95 struct mb_bucket {
        /*
         * Bookkeeping for the objects carved out of one page. mb_free[] is
         * a trailing array; mb_pop_cont() allocates the bucket with room for
         * PAGE_SIZE / ml_objsize pointers. mb_numfree is both the count of
         * free objects and the index of the next empty slot (stack top).
         */
   96         SLIST_ENTRY(mb_bucket)  mb_blist;       /* Link on container's list */
   97         int                     mb_owner;       /* Owner id, | MB_BUCKET_FREE */
   98         int                     mb_numfree;     /* # of entries in mb_free[] */
   99         void                    *mb_free[0];    /* Stack of free object ptrs */
  100 };
 101
  102 struct mb_container {
        /* List of buckets currently held by this container. */
  103         SLIST_HEAD(mc_buckethd, mb_bucket)      mc_bhead;
        /* Mutex protecting the container; may be shared with others. */
  104         struct  mtx                             *mc_lock;
        /* Owner id: a CPU number, or MB_GENLIST_OWNER for a general list. */
  105         int                                     mc_numowner;
        /* Bumped by mb_alloc_wait() when the container is found empty. */
  106         u_int                                   mc_starved;
        /* Points at this container's free-object counter in mb_statpcpu[]. */
  107         u_long                                  *mc_objcount;
        /* Points at this container's page (bucket) counter in mb_statpcpu[]. */
  108         u_long                                  *mc_numpgs;
  109 };
 110
  111 struct mb_gen_list {
        /* General ("main") list: a container plus the cv starved allocators sleep on. */
  112         struct  mb_container    mb_cont;
  113         struct  cv              mgl_mstarved;   /* Waited on in mb_alloc_wait() */
  114 };
 115
  116 struct mb_pcpu_list {
        /*
         * Per-CPU list. mb_cont is the first member under the same name as in
         * struct mb_gen_list, so the MB_*_CONT and MB_*_OBJECT macros accept
         * either list type.
         */
  117         struct  mb_container    mb_cont;
  118 };
 119
 120 /*
 121  * Boot-time configurable object counts that will determine the maximum
 122  * number of permitted objects in the mbuf and mcluster cases. In the
 123  * ext counter (nmbcnt) case, it's just an indicator serving to scale
 124  * kmem_map size properly - in other words, we may be allowed to allocate
 125  * more than nmbcnt counters, whereas we will never be allowed to allocate
 126  * more than nmbufs mbufs or nmbclusters mclusters.
 127  * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
 128  * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
 129  */
  130 #ifndef NMBCLUSTERS
  131 #define NMBCLUSTERS     (1024 + maxusers * 64)
  132 #endif
        /*
         * NMBUFS and NMBCNTS expand to expressions over the nmbclusters and
         * nsfbufs variables below, so their value depends on what those
         * variables hold at the point of expansion (see tunable_mbinit()).
         */
  133 #ifndef NMBUFS
  134 #define NMBUFS          (nmbclusters * 2)
  135 #endif
  136 #ifndef NSFBUFS
  137 #define NSFBUFS         (512 + maxusers * 16)
  138 #endif
  139 #ifndef NMBCNTS
  140 #define NMBCNTS         (nmbclusters + nsfbufs)
  141 #endif
  142 int     nmbufs;                 /* Max. mbufs; sizes the mbuf submap */
  143 int     nmbclusters;            /* Max. clusters; sizes the cluster submap */
  144 int     nmbcnt;                 /* Ext counter scaling hint (see above) */
  145 int     nsfbufs;                /* Max. sendfile(2) buffers */
 146
  147 /*
  148  * Perform sanity checks of tunables declared above.
       *
       * Each tunable is first given its compile-time default and then offered
       * to TUNABLE_INT_FETCH() under its "kern.ipc.*" name (presumably letting
       * a boot-time setting override the default -- confirm against the macro).
       * The assignments are order-dependent: the NMBUFS and NMBCNTS defaults
       * are expressions over nmbclusters and nsfbufs, so those must already
       * hold their final values when the defaults are evaluated.
       *
       * Afterwards nmbufs is raised to at least 2 * nmbclusters and nmbcnt to
       * at least nmbclusters + nsfbufs. No upper bounds or sign checks are
       * applied here.
       *
       * The `dummy' argument is unused; it exists to match the SYSINIT
       * callback signature.
  149  */
  150 static void
  151 tunable_mbinit(void *dummy)
  152 {
  153
  154         /*
  155          * This has to be done before VM init.
  156          */
  157         nmbclusters = NMBCLUSTERS;
  158         TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
  159         nmbufs = NMBUFS;
  160         TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
  161         nsfbufs = NSFBUFS;
  162         TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
  163         nmbcnt = NMBCNTS;
  164         TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
  165         /* Sanity checks */
  166         if (nmbufs < nmbclusters * 2)
  167                 nmbufs = nmbclusters * 2;
  168         if (nmbcnt < nmbclusters + nsfbufs)
  169                 nmbcnt = nmbclusters + nsfbufs;
  170
  171         return;
  172 }
  173 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
 174
 175 /*
 176  * The freelist structures and mutex locks. The number statically declared
 177  * here depends on the number of CPUs.
 178  *
  179  * We set things up in such a way that all the objects (mbufs, clusters)
 180  * share the same mutex lock. It has been established that we do not benefit
 181  * from different locks for different objects, so we use the same lock,
 182  * regardless of object type.
 183  */
  184 struct mb_lstmngr {
        /* One list manager exists per object type (mbufs, clusters). */
  185         struct  mb_gen_list     *ml_genlist;    /* General (global) list */
  186         struct  mb_pcpu_list    *ml_cntlst[NCPU]; /* Per-CPU lists, by CPU id */
        /* Page index (see MB_BUCKET_INDX) -> bucket managing that page. */
  187         struct  mb_bucket       **ml_btable;
  188         vm_map_t                ml_map;         /* Submap objects come from */
  189         vm_offset_t             ml_mapbase;     /* Filled by kmem_suballoc() */
  190         vm_offset_t             ml_maptop;      /* Filled by kmem_suballoc() */
        /* Set by mb_alloc() after a failed M_TRYWAIT refill; tested by mb_pop_cont(). */
  191         int                     ml_mapfull;
  192         u_int                   ml_objsize;     /* MSIZE or MCLBYTES */
  193         u_int                   *ml_wmhigh;     /* -> mbuf_limit / clust_limit */
  194 };
  195 struct  mb_lstmngr      mb_list_mbuf, mb_list_clust;
        /* mbuf_gen guards both general lists; mbuf_pcpu[i] guards both lists of CPU i. */
  196 struct  mtx             mbuf_gen, mbuf_pcpu[NCPU];
 197
 198 /*
 199  * Local macros for internal allocator structure manipulations.
 200  */
  201 #ifdef SMP
        /* Per-CPU list of the executing CPU; non-SMP kernels always use slot 0. */
  202 #define MB_GET_PCPU_LIST(mb_lst)          (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
  203 #else
  204 #define MB_GET_PCPU_LIST(mb_lst)          (mb_lst)->ml_cntlst[0]
  205 #endif
  206
        /* Per-CPU list of an explicitly named CPU. */
  207 #define MB_GET_PCPU_LIST_NUM(mb_lst, num) (mb_lst)->ml_cntlst[(num)]
  208
        /* General (global) list of the given list manager. */
  209 #define MB_GET_GEN_LIST(mb_lst)           (mb_lst)->ml_genlist
  210
        /*
         * Lock/unlock the mutex attached to a list's container. Works on both
         * mb_gen_list and mb_pcpu_list pointers since each has an mb_cont member.
         */
  211 #define MB_LOCK_CONT(mb_cnt)              mtx_lock((mb_cnt)->mb_cont.mc_lock)
  212
  213 #define MB_UNLOCK_CONT(mb_cnt)            mtx_unlock((mb_cnt)->mb_cont.mc_lock)
  214
        /*
         * Index into ml_btable for the page that contains object mb_obj:
         * the object's byte offset from the submap base divided by PAGE_SIZE.
         */
  215 #define MB_BUCKET_INDX(mb_obj, mb_lst)                                  \
  216     (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
 217
 218 #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst)                         \
 219 {                                                                       \
 220         struct  mc_buckethd     *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
 221                                                                         \
 222         (mb_bckt)->mb_numfree--;                                        \
 223         (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)];        \
 224         (*((mb_lst)->mb_cont.mc_objcount))--;                           \
 225         if ((mb_bckt)->mb_numfree == 0) {                               \
 226                 SLIST_REMOVE_HEAD(_mchd, mb_blist);                     \
 227                 SLIST_NEXT((mb_bckt), mb_blist) = NULL;                 \
 228                 (mb_bckt)->mb_owner |= MB_BUCKET_FREE;                  \
 229         }                                                               \
 230 }
 231
 232 #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst)                         \
 233         (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp);        \
 234         (mb_bckt)->mb_numfree++;                                        \
 235         (*((mb_lst)->mb_cont.mc_objcount))++;
 236
 237 /*
 238  * Ownership of buckets/containers is represented by integers. The PCPU
 239  * lists range from 0 to NCPU-1. We need a free numerical id for the general
 240  * list (we use NCPU). We also need a non-conflicting free bit to indicate
 241  * that the bucket is free and removed from a container, while not losing
 242  * the bucket's originating container id. We use the highest bit
 243  * for the free marker.
 244  */
  245 #define MB_GENLIST_OWNER        (NCPU)
        /*
         * Highest bit of an int; OR'ed into mb_owner when a bucket is emptied.
         * NOTE(review): this shifts a signed 1 into the sign bit and assumes
         * 8-bit bytes -- confirm this is acceptable for the compilers in use.
         */
  246 #define MB_BUCKET_FREE          (1 << (sizeof(int) * 8 - 1))
 247
 248 /*
 249  * sysctl(8) exported objects
 250  */
  251 struct  mbstat  mbstat;                 /* General stats + infos. */
  252 struct  mbpstat mb_statpcpu[NCPU+1];    /* PCPU + Gen. container alloc stats */
        /*
         * mb_statpcpu[] slots 0..NCPU-1 belong to the per-CPU containers; the
         * extra slot at index MB_GENLIST_OWNER (== NCPU) belongs to the general
         * containers. mbuf_wait is the timeout handed to cv_timedwait() in
         * mb_alloc_wait(); mbuf_limit and clust_limit are referenced through
         * each list manager's ml_wmhigh pointer.
         */
  253 int             mbuf_wait =     64;     /* Sleep time for wait code (ticks) */
  254 u_int           mbuf_limit =    512;    /* Upper lim. on # of mbufs per CPU */
  255 u_int           clust_limit =   128;    /* Upper lim. on # of clusts per CPU */
  256 SYSCTL_DECL(_kern_ipc);
  257 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
  258     "Maximum number of mbuf clusters available");
  259 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
  260     "Maximum number of mbufs available");
  261 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
  262     "Number used to scale kmem_map to ensure sufficient space for counters");
  263 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
  264     "Maximum number of sendfile(2) sf_bufs available");
  265 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
  266     "Sleep time of mbuf subsystem wait allocations during exhaustion");
  267 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
  268     "Upper limit of number of mbufs allowed on each PCPU list");
  269 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
  270     "Upper limit of number of mbuf clusters allowed on each PCPU list");
  271 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
  272     "Mbuf general information and statistics");
  273 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
  274     sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
 275
 276 /*
 277  * Prototypes of local allocator routines.
 278  */
  279 static __inline void    *mb_alloc(struct mb_lstmngr *, int);   /* Get one object */
  280 void                    *mb_alloc_wait(struct mb_lstmngr *);   /* M_TRYWAIT slow path */
  281 static __inline void     mb_free(struct mb_lstmngr *, void *); /* Return one object */
        /* Declared static here; the definition below omits the storage class. */
  282 static  void             mbuf_init(void *);
        /* Refill a per-CPU container with one freshly allocated bucket. */
  283 struct  mb_bucket       *mb_pop_cont(struct mb_lstmngr *, int,
  284                             struct mb_pcpu_list *);
  285 void                     mb_reclaim(void);      /* Called first by mb_alloc_wait() */
 286
 287 /*
 288  * Initial allocation numbers. Each parameter represents the number of buckets
 289  * of each object that will be placed initially in each PCPU container for
 290  * said object.
 291  */
  292 #define NMB_MBUF_INIT   4       /* mbuf buckets (pages) per CPU at boot */
  293 #define NMB_CLUST_INIT  16      /* cluster buckets (pages) per CPU at boot */
 294
 295 /*
 296  * Initialize the mbuf subsystem.
 297  *
 298  * We sub-divide the kmem_map into several submaps; this way, we don't have
 299  * to worry about artificially limiting the number of mbuf or mbuf cluster
 300  * allocations, due to fear of one type of allocation "stealing" address
 301  * space initially reserved for another.
 302  *
 303  * Setup both the general containers and all the PCPU containers. Populate
 304  * the PCPU containers with initial numbers.
 305  */
 306 MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
 307 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
 308 void
 309 mbuf_init(void *dummy)
 310 {
 311         struct  mb_pcpu_list    *pcpu_cnt;
 312         vm_size_t               mb_map_size;
 313         int                     i, j;
 314
 315         /*
 316          * Setup all the submaps, for each type of object that we deal
 317          * with in this allocator.
 318          */
 319         mb_map_size = (vm_size_t)(nmbufs * MSIZE);
 320         mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
 321         mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
 322             sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
 323         if (mb_list_mbuf.ml_btable == NULL)
 324                 goto bad;
 325         mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
 326             &(mb_list_mbuf.ml_maptop), mb_map_size);
 327         mb_list_mbuf.ml_mapfull = 0;
 328         mb_list_mbuf.ml_objsize = MSIZE;
 329         mb_list_mbuf.ml_wmhigh = &mbuf_limit;
 330
 331         mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
 332         mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
 333         mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
 334             * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
 335         if (mb_list_clust.ml_btable == NULL)
 336                 goto bad;
 337         mb_list_clust.ml_map = kmem_suballoc(kmem_map,
 338             &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
 339             mb_map_size);
 340         mb_list_clust.ml_mapfull = 0;
 341         mb_list_clust.ml_objsize = MCLBYTES;
 342         mb_list_clust.ml_wmhigh = &clust_limit;
 343
 344         /* XXX XXX XXX: mbuf_map->system_map = clust_map->system_map = 1 */
 345
 346         /*
 347          * Allocate required general (global) containers for each object type.
 348          */
 349         mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
 350             M_NOWAIT);
 351         mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
 352             M_NOWAIT);
 353         if ((mb_list_mbuf.ml_genlist == NULL) ||
 354             (mb_list_clust.ml_genlist == NULL))
 355                 goto bad;
 356
 357         /*
 358          * Initialize condition variables and general container mutex locks.
 359          */
 360         mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", 0);
 361         cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
 362         cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
 363             "mcluster pool starved");
 364         mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
 365             mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
 366
 367         /*
 368          * Setup the general containers for each object.
 369          */
 370         mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
 371             mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
 372         mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
 373             mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
 374         mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
 375             &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
 376         mb_list_clust.ml_genlist->mb_cont.mc_objcount =
 377             &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
 378         mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
 379             &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
 380         mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
 381             &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
 382         SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
 383         SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
 384
 385         /*
 386          * Initialize general mbuf statistics
 387          */
 388         mbstat.m_msize = MSIZE;
 389         mbstat.m_mclbytes = MCLBYTES;
 390         mbstat.m_minclsize = MINCLSIZE;
 391         mbstat.m_mlen = MLEN;
 392         mbstat.m_mhlen = MHLEN;
 393
 394         /*
 395          * Allocate and initialize PCPU containers.
 396          */
 397         for (i = 0; i < NCPU; i++) {
 398                 if (CPU_ABSENT(i))
 399                         continue;
 400
 401                 mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
 402                     M_MBUF, M_NOWAIT);
 403                 mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
 404                     M_MBUF, M_NOWAIT);
 405                 if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
 406                     (mb_list_clust.ml_cntlst[i] == NULL))
 407                         goto bad;
 408
 409                 mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", 0);
 410                 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
 411                     mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
 412
 413                 mb_statpcpu[i].mb_active = 1;
 414                 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
 415                     mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
 416                 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
 417                     mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
 418                 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
 419                     &(mb_statpcpu[i].mb_mbfree);
 420                 mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
 421                     &(mb_statpcpu[i].mb_clfree);
 422                 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
 423                     &(mb_statpcpu[i].mb_mbpgs);
 424                 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
 425                     &(mb_statpcpu[i].mb_clpgs);
 426
 427                 SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
 428                 SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
 429
 430                 /*
 431                  * Perform initial allocations.
 432                  */
 433                 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
 434                 MB_LOCK_CONT(pcpu_cnt);
 435                 for (j = 0; j < NMB_MBUF_INIT; j++) {
 436                         if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
 437                             == NULL)
 438                                 goto bad;
 439                 }
 440                 MB_UNLOCK_CONT(pcpu_cnt);
 441
 442                 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
 443                 MB_LOCK_CONT(pcpu_cnt);
 444                 for (j = 0; j < NMB_CLUST_INIT; j++) {
 445                         if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
 446                             == NULL)
 447                                 goto bad;
 448                 }
 449                 MB_UNLOCK_CONT(pcpu_cnt);
 450         }
 451
 452         return;
 453 bad:
 454         panic("mbuf_init(): failed to initialize mbuf subsystem!");
 455 }
 456
  457 /*
  458  * Populate a given mbuf PCPU container with a bucket full of fresh new
  459  * buffers. Return a pointer to the new bucket (already in the container if
  460  * successful), or return NULL on failure.
  461  *
       * Arguments:
       *   mb_list  list manager for the object type (supplies the submap,
       *            object size and bucket table).
       *   how      M_TRYWAIT makes the underlying allocations use M_WAITOK;
       *            any other value makes them use M_NOWAIT.
       *   cnt_lst  per-CPU list that receives the new bucket.
       *
       * Failure causes visible here: the map is already flagged full
       * (ml_mapfull), the bucket header cannot be allocated, or no page can
       * be obtained from the submap.
       *
  462  * LOCKING NOTES:
  463  * PCPU container lock must be held when this is called.
  464  * The lock is dropped here so that we can cleanly call the underlying VM
  465  * code. If we fail, we return with no locks held. If we succeed (i.e. return
  466  * non-NULL), we return with the PCPU lock held, ready for allocation from
  467  * the returned bucket.
  468  */
  469 struct mb_bucket *
  470 mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
  471 {
  472         struct  mb_bucket       *bucket;
  473         caddr_t                 p;
  474         int                     i;
  475
  476         MB_UNLOCK_CONT(cnt_lst);
  477         /*
  478          * If our object's (finite) map is starved now (i.e. no more address
  479          * space), bail out now.
  480          */
  481         if (mb_list->ml_mapfull)
  482                 return (NULL);
  483
              /* Bucket header plus one free-pointer slot per object in a page. */
  484         bucket = malloc(sizeof(struct mb_bucket) +
  485             PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
  486             how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
  487         if (bucket == NULL)
  488                 return (NULL);
  489
  490         p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
  491             how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
  492         if (p == NULL) {
  493                 free(bucket, M_MBUF);
  494                 return (NULL);
  495         }
  496
  497         bucket->mb_numfree = 0;
              /* Record the bucket by page index so mb_free() can find it from an object address. */
  498         mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
              /* Slice the page into objects and push each onto the free stack. */
  499         for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
  500                 bucket->mb_free[i] = p;
  501                 bucket->mb_numfree++;
  502                 p += mb_list->ml_objsize;
  503         }
  504
              /* Retake the container lock before publishing the bucket and updating stats. */
  505         MB_LOCK_CONT(cnt_lst);
  506         bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
  507         SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
  508         (*(cnt_lst->mb_cont.mc_numpgs))++;
  509         *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
  510
  511         return (bucket);
  512 }
 513
  514 /*
  515  * Allocate an mbuf-subsystem type object.
  516  * The general case is very easy. Complications only arise if our PCPU
  517  * container is empty. Things get worse if the PCPU container is empty,
  518  * the general container is empty, and we've run out of address space
  519  * in our map; then we try to block if we're willing to (M_TRYWAIT).
       *
       * Arguments:
       *   mb_list  list manager for the object type being allocated.
       *   how      M_TRYWAIT permits falling back to mb_alloc_wait(); any
       *            other value fails immediately once a refill fails.
       *
       * Returns a pointer to the object, or NULL on failure. No lock is held
       * on return.
       *
       * Lock order used below: the per-CPU container lock is taken first and
       * the general container lock second.
  520  */
  521 static __inline
  522 void *
  523 mb_alloc(struct mb_lstmngr *mb_list, int how)
  524 {
  525         struct  mb_pcpu_list    *cnt_lst;
  526         struct  mb_bucket       *bucket;
  527         void                    *m;
  528
  529         m = NULL;
  530         cnt_lst = MB_GET_PCPU_LIST(mb_list);
  531         MB_LOCK_CONT(cnt_lst);
  532
  533         if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
  534                 /*
  535                  * This is the easy allocation case. We just grab an object
  536                  * from a bucket in the PCPU container. At worst, we
  537                  * have just emptied the bucket and so we remove it
  538                  * from the container.
  539                  */
  540                 MB_GET_OBJECT(m, bucket, cnt_lst);
  541                 MB_UNLOCK_CONT(cnt_lst);
  542         } else {
  543                 struct  mb_gen_list *gen_list;
  544
  545                 /*
  546                  * This is the less-common more difficult case. We must
  547                  * first verify if the general list has anything for us
  548                  * and if that also fails, we must allocate a page from
  549                  * the map and create a new bucket to place in our PCPU
  550                  * container (already locked). If the map is starved then
  551                  * we're really in for trouble, as we have to wait on
  552                  * the general container's condition variable.
  553                  */
  554                 gen_list = MB_GET_GEN_LIST(mb_list);
  555                 MB_LOCK_CONT(gen_list);
  556
  557                 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
  558                     != NULL) {
  559                         /*
  560                          * Give ownership of the bucket to our CPU's
  561                          * container, but only actually put the bucket
  562                          * in the container if it doesn't become free
  563                          * upon removing an mbuf from it.
                               *
                               * The page count moves from the general list
                               * to ours, and all of the bucket's free objects
                               * leave the general list's count; only what is
                               * left after taking one is added to ours.
  564                          */
  565                         SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
  566                             mb_blist);
  567                         bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
  568                         (*(gen_list->mb_cont.mc_numpgs))--;
  569                         (*(cnt_lst->mb_cont.mc_numpgs))++;
  570                         *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
  571                         bucket->mb_numfree--;
  572                         m = bucket->mb_free[(bucket->mb_numfree)];
  573                         if (bucket->mb_numfree == 0) {
  574                                 SLIST_NEXT(bucket, mb_blist) = NULL;
  575                                 bucket->mb_owner |= MB_BUCKET_FREE;
  576                         } else {
  577                                 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
  578                                      bucket, mb_blist);
  579                                 *(cnt_lst->mb_cont.mc_objcount) +=
  580                                     bucket->mb_numfree;
  581                         }
  582                         MB_UNLOCK_CONT(gen_list);
  583                         MB_UNLOCK_CONT(cnt_lst);
  584                 } else {
  585                         /*
  586                          * We'll have to allocate a new page.
                               *
                               * mb_pop_cont() drops the PCPU lock itself. On
                               * success it returns with that lock held again;
                               * on failure it returns with no lock held, so
                               * the failure branch below must not unlock.
  587                          */
  588                         MB_UNLOCK_CONT(gen_list);
  589                         bucket = mb_pop_cont(mb_list, how, cnt_lst);
  590                         if (bucket != NULL) {
  591                                 bucket->mb_numfree--;
  592                                 m = bucket->mb_free[(bucket->mb_numfree)];
  593                                 (*(cnt_lst->mb_cont.mc_objcount))--;
  594                                 MB_UNLOCK_CONT(cnt_lst);
  595                         } else {
  596                                 if (how == M_TRYWAIT) {
  597                                   /*
  598                                    * Absolute worst-case scenario. We block if
  599                                    * we're willing to, but only after trying to
  600                                    * steal from other lists.
                                         *
                                         * NOTE(review): ml_mapfull is set here
                                         * without a lock held and is not cleared
                                         * anywhere in the visible part of this
                                         * file -- confirm that is intended.
  601                                    */
  602                                         mb_list->ml_mapfull = 1;
  603                                         m = mb_alloc_wait(mb_list);
  604                                 } else
  605                                         /* XXX: No consistency. */
  606                                         mbstat.m_drops++;
  607                         }
  608                 }
  609         }
  610
  611         return (m);
  612 }
 613
  614 /*
  615  * This is the worst-case scenario called only if we're allocating with
  616  * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
  617  * by looking in every PCPU container. If we're still unsuccessful, we
  618  * try the general container one last time and possibly block on our
  619  * starved cv.
       *
       * Arguments:
       *   mb_list  list manager for the object type being allocated.
       *
       * Returns a pointer to an object, or NULL if nothing could be obtained
       * within one timed wait of mbuf_wait ticks. mbstat.m_wait is bumped on
       * every success and mbstat.m_drops on failure. Called with no container
       * lock held; returns with none held.
  620  */
  621 void *
  622 mb_alloc_wait(struct mb_lstmngr *mb_list)
  623 {
  624         struct  mb_pcpu_list    *cnt_lst;
  625         struct  mb_gen_list     *gen_list;
  626         struct  mb_bucket       *bucket;
  627         void                    *m;
  628         int                     i, cv_ret;
  629
  630         /*
  631          * Try to reclaim mbuf-related objects (mbufs, clusters).
  632          */
  633         mb_reclaim();
  634
  635         /*
  636          * Cycle all the PCPU containers. Increment starved counts if found
  637          * empty.
               *
               * NOTE(review): the per-CPU mc_starved counts incremented here
               * are not decremented anywhere in the visible part of this
               * file -- confirm where they are reset.
  638          */
  639         for (i = 0; i < NCPU; i++) {
  640                 if (CPU_ABSENT(i))
  641                         continue;
  642                 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
  643                 MB_LOCK_CONT(cnt_lst);
  644
  645                 /*
  646                  * If container is non-empty, get a single object from it.
  647                  * If empty, increment starved count.
  648                  */
  649                 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
  650                     NULL) {
  651                         MB_GET_OBJECT(m, bucket, cnt_lst);
  652                         MB_UNLOCK_CONT(cnt_lst);
  653                         mbstat.m_wait++;        /* XXX: No consistency. */
  654                         return (m);
  655                 } else
  656                         cnt_lst->mb_cont.mc_starved++;
  657
  658                 MB_UNLOCK_CONT(cnt_lst);
  659         }
  660
  661         /*
  662          * We're still here, so that means it's time to get the general
  663          * container lock, check it one more time (now that mb_reclaim()
  664          * has been called) and if we still get nothing, block on the cv.
  665          */
  666         gen_list = MB_GET_GEN_LIST(mb_list);
  667         MB_LOCK_CONT(gen_list);
  668         if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
  669                 MB_GET_OBJECT(m, bucket, gen_list);
  670                 MB_UNLOCK_CONT(gen_list);
  671                 mbstat.m_wait++;        /* XXX: No consistency. */
  672                 return (m);
  673         }
  674
              /*
               * Advertise that a thread is waiting on the general list, then
               * sleep on its cv with the general lock as the interlock. The
               * starved count covers only the duration of the sleep.
               */
  675         gen_list->mb_cont.mc_starved++;
  676         cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
  677             gen_list->mb_cont.mc_lock, mbuf_wait);
  678         gen_list->mb_cont.mc_starved--;
  679
              /*
               * Only a zero return from cv_timedwait() is treated as success
               * (presumably "woken before the timeout" -- see condvar(9)).
               * After a non-zero return the list is not re-checked and the
               * request is counted as a drop.
               */
  680         if ((cv_ret == 0) &&
  681             ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
  682                 MB_GET_OBJECT(m, bucket, gen_list);
  683                 mbstat.m_wait++;        /* XXX: No consistency. */
  684         } else {
  685                 mbstat.m_drops++;       /* XXX: No consistency. */
  686                 m = NULL;
  687         }
  688
  689         MB_UNLOCK_CONT(gen_list);
  690
  691         return (m);
  692 }
 693
 694 /*
 695  * Free an object to its rightful container.
 696  * In the very general case, this operation is really very easy.
 697  * Complications arise primarily if:
 698  *      (a) We've hit the high limit on number of free objects allowed in
 699  *          our PCPU container.
 700  *      (b) We're in a critical situation where our container has been
 701  *          marked 'starved' and we need to issue wakeups on the starved
 702  *          condition variable.
 703  *      (c) Minor (odd) cases: our bucket has migrated while we were
 704  *          waiting for the lock; our bucket is in the general container;
 705  *          our bucket is empty.
 706  */
 707 static __inline
 708 void
 709 mb_free(struct mb_lstmngr *mb_list, void *m)
 710 {
 711         struct  mb_pcpu_list    *cnt_lst;
 712         struct  mb_gen_list     *gen_list;
 713         struct  mb_bucket       *bucket;
 714         u_int                   owner;
 715
 716         bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
 717
 718         /*
 719          * Make sure that if after we lock the bucket's present container the
 720          * bucket has migrated, that we drop the lock and get the new one.
 721          */
 722 retry_lock:
 723         owner = bucket->mb_owner & ~MB_BUCKET_FREE;
 724         switch (owner) {
 725         case MB_GENLIST_OWNER:
 726                 gen_list = MB_GET_GEN_LIST(mb_list);
 727                 MB_LOCK_CONT(gen_list);
 728                 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
 729                         MB_UNLOCK_CONT(gen_list);
 730                         goto retry_lock;
 731                 }
 732
 733                 /*
 734                  * If we're intended for the general container, this is
 735                  * real easy: no migrating required. The only `bogon'
 736                  * is that we're now contending with all the threads
 737                  * dealing with the general list, but this is expected.
 738                  */
 739                 MB_PUT_OBJECT(m, bucket, gen_list);
 740                 if (gen_list->mb_cont.mc_starved > 0)
 741                         cv_signal(&(gen_list->mgl_mstarved));
 742                 MB_UNLOCK_CONT(gen_list);
 743                 break;
 744
 745         default:
 746                 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
 747                 MB_LOCK_CONT(cnt_lst);
 748                 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
 749                         MB_UNLOCK_CONT(cnt_lst);
 750                         goto retry_lock;
 751                 }
 752
 753                 MB_PUT_OBJECT(m, bucket, cnt_lst);
 754
 755                 if (cnt_lst->mb_cont.mc_starved > 0) {
 756                         /*
 757                          * This is a tough case. It means that we've
 758                          * been flagged at least once to indicate that
 759                          * we're empty, and that the system is in a critical
 760                          * situation, so we ought to migrate at least one
 761                          * bucket over to the general container.
 762                          * There may or may not be a thread blocking on
 763                          * the starved condition variable, but chances
 764                          * are that one will eventually come up soon so
 765                          * it's better to migrate now than never.
 766                          */
 767                         gen_list = MB_GET_GEN_LIST(mb_list);
 768                         MB_LOCK_CONT(gen_list);
 769                         KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0,
 770                             ("mb_free: corrupt bucket %p\n", bucket));
 771                         SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
 772                             bucket, mb_blist);
 773                         bucket->mb_owner = MB_GENLIST_OWNER;
 774                         (*(cnt_lst->mb_cont.mc_objcount))--;
 775                         (*(gen_list->mb_cont.mc_objcount))++;
 776                         (*(cnt_lst->mb_cont.mc_numpgs))--;
 777                         (*(gen_list->mb_cont.mc_numpgs))++;
 778
 779                         /*
 780                          * Determine whether or not to keep transferring
 781                          * buckets to the general list or whether we've
 782                          * transferred enough already.
 783                          * We realize that although we may flag another
 784                          * bucket to be migrated to the general container
 785                          * that in the meantime, the thread that was
 786                          * blocked on the cv is already woken up and
 787                          * long gone. But in that case, the worst
 788                          * consequence is that we will end up migrating
 789                          * one bucket too many, which is really not a big
 790                          * deal, especially if we're close to a critical
 791                          * situation.
 792                          */
 793                         if (gen_list->mb_cont.mc_starved > 0) {
 794                                 cnt_lst->mb_cont.mc_starved--;
 795                                 cv_signal(&(gen_list->mgl_mstarved));
 796                         } else
 797                                 cnt_lst->mb_cont.mc_starved = 0;
 798
 799                         MB_UNLOCK_CONT(gen_list);
 800                         MB_UNLOCK_CONT(cnt_lst);
 801                         break;
 802                 }
 803
 804                 if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) {
 805                         /*
 806                          * We've hit the high limit of allowed numbers of mbufs
 807                          * on this PCPU list. We must now migrate a bucket
 808                          * over to the general container.
 809                          */
 810                         gen_list = MB_GET_GEN_LIST(mb_list);
 811                         MB_LOCK_CONT(gen_list);
 812                         if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
 813                                 bucket =
 814                                     SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
 815                                 SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
 816                                     mb_blist);
 817                         }
 818                         SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
 819                             bucket, mb_blist);
 820                         bucket->mb_owner = MB_GENLIST_OWNER;
 821                         *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
 822                         *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
 823                         (*(cnt_lst->mb_cont.mc_numpgs))--;
 824                         (*(gen_list->mb_cont.mc_numpgs))++;
 825
 826                         MB_UNLOCK_CONT(gen_list);
 827                         MB_UNLOCK_CONT(cnt_lst);
 828                         break;
 829                 }
 830
 831                 if (bucket->mb_owner & MB_BUCKET_FREE) {
 832                         SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
 833                             bucket, mb_blist);
 834                         bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
 835                 }
 836
 837                 MB_UNLOCK_CONT(cnt_lst);
 838                 break;
 839         }
 840
 841         return;
 842 }
 843
 844 /*
 845  * Drain protocols in hopes to free up some resources.
 846  *
 847  * LOCKING NOTES:
 848  * No locks should be held when this is called. The drain routines have to
 849  * presently acquire some locks which raises the possibility of lock order
 850  * violation if we're holding any mutex if that mutex is acquired in reverse
 851  * order relative to one of the locks in the drain routines.
 852  */
 853 void
 854 mb_reclaim(void)
 855 {
 856         struct  domain  *dp;
 857         struct  protosw *pr;
 858
 859 /*
 860  * XXX: Argh, we almost always trip here with witness turned on now-a-days
 861  * XXX: because we often come in with Giant held. For now, there's no way
 862  * XXX: to avoid this.
 863  */
 864 #ifdef WITNESS
 865         KASSERT(witness_list(curproc) == 0,
 866             ("mb_reclaim() called with locks held"));
 867 #endif
 868
 869         mbstat.m_drain++;       /* XXX: No consistency. */
 870
 871         for (dp = domains; dp; dp = dp->dom_next)
 872                 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
 873                         if (pr->pr_drain)
 874                                 (*pr->pr_drain)();
 875
 876 }
 877
 878 /*
 879  * Local mbuf & cluster alloc macros and routines.
 880  * Local macro and function names begin with an underscore ("_").
 881  */
 882 void    _mext_free(struct mbuf *);
 883 void    _mclfree(struct mbuf *);
 884
 885 #define _m_get(m, how, type) do {                                       \
 886         (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how));            \
 887         if ((m) != NULL) {                                              \
 888                 (m)->m_type = (type);                                   \
 889                 (m)->m_next = NULL;                                     \
 890                 (m)->m_nextpkt = NULL;                                  \
 891                 (m)->m_data = (m)->m_dat;                               \
 892                 (m)->m_flags = 0;                                       \
 893         }                                                               \
 894 } while (0)
 895
 896 #define _m_gethdr(m, how, type) do {                                    \
 897         (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how));            \
 898         if ((m) != NULL) {                                              \
 899                 (m)->m_type = (type);                                   \
 900                 (m)->m_next = NULL;                                     \
 901                 (m)->m_nextpkt = NULL;                                  \
 902                 (m)->m_data = (m)->m_pktdat;                            \
 903                 (m)->m_flags = M_PKTHDR;                                \
 904                 (m)->m_pkthdr.rcvif = NULL;                             \
 905                 (m)->m_pkthdr.csum_flags = 0;                           \
 906                 (m)->m_pkthdr.aux = NULL;                               \
 907         }                                                               \
 908 } while (0)
 909
 910 /* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME) */
 911 #define _m_free(m, n) do {                                              \
 912         (n) = (m)->m_next;                                              \
 913         if ((m)->m_flags & M_EXT)                                       \
 914                 MEXTFREE((m));                                          \
 915         if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) {      \
 916                 m_freem((m)->m_pkthdr.aux);                             \
 917                 (m)->m_pkthdr.aux = NULL;                               \
 918         }                                                               \
 919         mb_free(&mb_list_mbuf, (m));                                    \
 920 } while (0)
 921
 922 #define _mext_init_ref(m) do {                                          \
 923         (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT);   \
 924         if ((m)->m_ext.ref_cnt != NULL) {                               \
 925                 *((m)->m_ext.ref_cnt) = 0;                              \
 926                 MEXT_ADD_REF((m));                                      \
 927         }                                                               \
 928 } while (0)
 929
 930 #define _mext_dealloc_ref(m)                                            \
 931         free((m)->m_ext.ref_cnt, M_MBUF)
 932
 933 void
 934 _mext_free(struct mbuf *mb)
 935 {
 936
 937         if (mb->m_ext.ext_type == EXT_CLUSTER)
 938                 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf);
 939         else
 940                 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
 941
 942         _mext_dealloc_ref(mb);
 943         return;
 944 }
 945
 946 /* We only include this here to avoid making m_clget() excessively large
 947  * due to too much inlined code. */
 948 void
 949 _mclfree(struct mbuf *mb)
 950 {
 951
 952         mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf);
 953         mb->m_ext.ext_buf = NULL;
 954         return;
 955 }
 956
 957 /*
 958  * Exported space allocation and de-allocation routines.
 959  */
 960 struct mbuf *
 961 m_get(int how, int type)
 962 {
 963         struct  mbuf *mb;
 964
 965         _m_get(mb, how, type);
 966         return (mb);
 967 }
 968
 969 struct mbuf *
 970 m_gethdr(int how, int type)
 971 {
 972         struct  mbuf *mb;
 973
 974         _m_gethdr(mb, how, type);
 975         return (mb);
 976 }
 977
 978 struct mbuf *
 979 m_get_clrd(int how, int type)
 980 {
 981         struct  mbuf *mb;
 982
 983         _m_get(mb, how, type);
 984
 985         if (mb != NULL)
 986                 bzero(mtod(mb, caddr_t), MLEN);
 987
 988         return (mb);
 989 }
 990
 991 struct mbuf *
 992 m_gethdr_clrd(int how, int type)
 993 {
 994         struct  mbuf *mb;
 995
 996         _m_gethdr(mb, how, type);
 997
 998         if (mb != NULL)
 999                 bzero(mtod(mb, caddr_t), MHLEN);
1000
1001         return (mb);
1002 }
1003
/*
 * Free a single mbuf, including any external storage and aux chain it
 * carries. Returns the mbuf that followed it (its m_next), so a caller
 * can keep walking the chain.
 */
struct mbuf *
m_free(struct mbuf *mb)
{
	struct	mbuf *next;

	_m_free(mb, next);

	return (next);
}
1012
1013 void
1014 m_clget(struct mbuf *mb, int how)
1015 {
1016
1017         mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how);
1018         if (mb->m_ext.ext_buf != NULL) {
1019                 _mext_init_ref(mb);
1020                 if (mb->m_ext.ref_cnt == NULL)
1021                         _mclfree(mb);
1022                 else {
1023                         mb->m_data = mb->m_ext.ext_buf;
1024                         mb->m_flags |= M_EXT;
1025                         mb->m_ext.ext_free = NULL;
1026                         mb->m_ext.ext_args = NULL;
1027                         mb->m_ext.ext_size = MCLBYTES;
1028                         mb->m_ext.ext_type = EXT_CLUSTER;
1029                 }
1030         }
1031         return;
1032 }
1033
1034 void
1035 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
1036          void (*freef)(caddr_t, void *), void *args, short flags, int type)
1037 {
1038
1039         _mext_init_ref(mb);
1040         if (mb->m_ext.ref_cnt != NULL) {
1041                 mb->m_flags |= (M_EXT | flags);
1042                 mb->m_ext.ext_buf = buf;
1043                 mb->m_data = mb->m_ext.ext_buf;
1044                 mb->m_ext.ext_size = size;
1045                 mb->m_ext.ext_free = freef;
1046                 mb->m_ext.ext_args = args;
1047                 mb->m_ext.ext_type = type;
1048         }
1049         return;
1050 }