3 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 #include "opt_param.h"
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/condvar.h>
40 #include <sys/kernel.h>
41 #include <sys/sysctl.h>
42 #include <sys/domain.h>
43 #include <sys/protosw.h>
45 #include <vm/vm_kern.h>
46 #include <vm/vm_extern.h>
49 * Maximum number of PCPU containers. If you know what you're doing you could
50 * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
51 * system during compilation, and thus prevent kernel structure bloat.
53 * SMP and non-SMP kernels clearly have a different number of possible CPUs,
54 * but because we cannot assume a dense array of CPUs, we always allocate
55 * and traverse PCPU containers up to NCPU, merely checking each one for availability.
59 #define NCPU MBALLOC_NCPU
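/*
 * Illustrative note: MBALLOC_NCPU would normally be supplied through the
 * kernel configuration so that it lands in opt_param.h (included above);
 * the exact option plumbing is assumed here, e.g.:
 *
 *	options		MBALLOC_NCPU=8
 *
 * With that in place, NCPU above becomes 8 and only eight PCPU containers
 * are declared and traversed, instead of the larger compile-time default.
 */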
65 * The mbuf allocator is heavily based on Alfred Perlstein's
66 * (alfred@FreeBSD.org) "memcache" allocator which is itself based
67 * on concepts from several per-CPU memory allocators. This allocator
68 * differs from memcache in that, among other things:
70 * (i) We don't free back to the map from the free() routine - we leave the
71 * option of implementing lazy freeing (from a kproc) in the future.
73 * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
74 * maximum number of allocatable objects of a given type. Further,
75 * we handle blocking on a cv in the case that the map is starved and
76 * we have to rely solely on cached (circulating) objects.
78 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
79 * The buckets keep a page worth of objects (an object can be an mbuf or an
80 * mbuf cluster) and facilitate moving larger sets of contiguous objects
81 * from the per-CPU lists to the main list for the given object. The buckets
82 * also have an added advantage in that after several moves from a per-CPU
83 * list to the main list and back to the per-CPU list, contiguous objects
84 * are kept together, which helps put the TLB to good use.
86 * The buckets are kept on singly-linked lists called "containers." A container
87 * is protected by a mutex lock in order to ensure consistency. The mutex lock
88 * itself is allocated separately and attached to the container at boot time,
89 * thus allowing for certain containers to share the same mutex lock. Per-CPU
90 * containers for mbufs and mbuf clusters all share the same per-CPU
91 * lock whereas the "general system" containers (i.e. the "main lists") for
92 * these objects share one global lock.
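/*
 * Rough sketch of the allocator's data structures as they are used below
 * (field names taken from the code in this file):
 *
 *	struct mb_lstmngr (one per object type: mbufs, clusters)
 *	    ml_genlist  -> struct mb_gen_list	(the "main list")
 *	    ml_cntlst[] -> struct mb_pcpu_list	(one per CPU, up to NCPU)
 *	    ml_btable[] -> struct mb_bucket *	(page index -> bucket)
 *
 * Each list embeds a struct mb_container (mc_lock, mc_bhead, mc_numowner,
 * mc_starved, mc_objcount, mc_numpgs, mc_types), and each bucket carries
 * its mb_blist linkage, an mb_owner tag, an mb_numfree count and the
 * mb_free[] array of object pointers covering one page worth of objects.
 */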
96 SLIST_ENTRY(mb_bucket) mb_blist;
102 struct mb_container {
103 SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
113 struct mb_container mb_cont;
114 struct cv mgl_mstarved;
117 struct mb_pcpu_list {
118 struct mb_container mb_cont;
122 * Boot-time configurable object counts that will determine the maximum
123 * number of permitted objects in the mbuf and mcluster cases. In the
124 * ext counter (nmbcnt) case, it's just an indicator serving to scale
125 * kmem_map size properly - in other words, we may be allowed to allocate
126 * more than nmbcnt counters, whereas we will never be allowed to allocate
127 * more than nmbufs mbufs or nmbclusters mclusters.
128 * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
129 * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
132 #define NMBCLUSTERS (1024 + maxusers * 64)
135 #define NMBUFS (nmbclusters * 2)
138 #define NSFBUFS (512 + maxusers * 16)
141 #define NMBCNTS (nmbclusters + nsfbufs)
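/*
 * Example: the counts above are fetched from the loader environment by
 * tunable_mbinit() below, so they can be overridden at boot time from
 * /boot/loader.conf (values below are purely illustrative):
 *
 *	kern.ipc.nmbclusters="32768"
 *	kern.ipc.nmbufs="65536"
 *
 * The sanity checks in tunable_mbinit() will still raise nmbufs and nmbcnt
 * if the requested values are inconsistent with nmbclusters and nsfbufs.
 */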
149 * Perform sanity checks of tunables declared above.
152 tunable_mbinit(void *dummy)
156 * This has to be done before VM init.
158 nmbclusters = NMBCLUSTERS;
159 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
161 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
163 TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
165 TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
167 if (nmbufs < nmbclusters * 2)
168 nmbufs = nmbclusters * 2;
169 if (nmbcnt < nmbclusters + nsfbufs)
170 nmbcnt = nmbclusters + nsfbufs;
174 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
177 * The freelist structures and mutex locks. The number statically declared
178 * here depends on the number of CPUs.
180 * We set things up in such a way that all the objects (mbufs, clusters)
181 * share the same mutex lock. It has been established that we do not benefit
182 * from different locks for different objects, so we use the same lock,
183 * regardless of object type.
186 struct mb_gen_list *ml_genlist;
187 struct mb_pcpu_list *ml_cntlst[NCPU];
188 struct mb_bucket **ml_btable;
190 vm_offset_t ml_mapbase;
191 vm_offset_t ml_maptop;
196 struct mb_lstmngr mb_list_mbuf, mb_list_clust;
197 struct mtx mbuf_gen, mbuf_pcpu[NCPU];
200 * Local macros for internal allocator structure manipulations.
202 #ifdef SMP
203 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
204 #else
205 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
206 #endif
208 #define MB_GET_PCPU_LIST_NUM(mb_lst, num) (mb_lst)->ml_cntlst[(num)]
210 #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
212 #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
214 #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
216 #define MB_BUCKET_INDX(mb_obj, mb_lst) \
217 (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
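/*
 * Note on MB_BUCKET_INDX: it maps an object's address to the ml_btable slot
 * of the page the object lives in. As an illustrative example (assuming 4K
 * pages), an object at ml_mapbase + 0x3420 lies in page 3 of the submap, so
 * slot 3 is used both when mb_pop_cont() registers a new bucket and when
 * mb_free() has to find the bucket an object belongs to.
 */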
219 #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
221 struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
223 (mb_bckt)->mb_numfree--; \
224 (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
225 (*((mb_lst)->mb_cont.mc_objcount))--; \
226 if ((mb_bckt)->mb_numfree == 0) { \
227 SLIST_REMOVE_HEAD(_mchd, mb_blist); \
228 SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
229 (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
233 #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
234 (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
235 (mb_bckt)->mb_numfree++; \
236 (*((mb_lst)->mb_cont.mc_objcount))++;
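/*
 * Summary of the two macros above: MB_GET_OBJECT pops the last pointer off
 * the bucket's mb_free[] array, decrements the container's mc_objcount and,
 * if the bucket just went empty, unlinks it from the container's bucket list
 * and tags its mb_owner with MB_BUCKET_FREE. MB_PUT_OBJECT is the inverse
 * for the common case: it pushes the object back onto mb_free[] and bumps
 * mc_objcount; re-linking a bucket that was tagged free is left to the
 * callers in mb_free().
 */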
238 #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
239 if ((mb_type) != MT_NOTMBUF) \
240 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
242 #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
243 if ((mb_type) != MT_NOTMBUF) \
244 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
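/*
 * Note: mc_types points at a container's array of per-mbuf-type counters
 * (mb_mbtypes[]). Cluster containers have mc_types set to NULL and clusters
 * are always allocated and freed as MT_NOTMBUF, so the two macros above
 * deliberately skip the update in that case.
 */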
247 * Ownership of buckets/containers is represented by integers. The PCPU
248 * lists range from 0 to NCPU-1. We need a free numerical id for the general
249 * list (we use NCPU). We also need a non-conflicting free bit to indicate
250 * that the bucket is free and removed from a container, while not losing
251 * the bucket's originating container id. We use the highest bit
252 * for the free marker.
254 #define MB_GENLIST_OWNER (NCPU)
255 #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
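/*
 * Illustrative example of the encoding above (assuming a 32-bit int, so
 * MB_BUCKET_FREE == 0x80000000): a bucket owned by CPU 3 has mb_owner == 3;
 * once it empties and is pulled off its list, mb_owner becomes 0x80000003.
 * Masking with ~MB_BUCKET_FREE still yields 3, so mb_free() can tell which
 * container a bucket came from even while it is marked free.
 */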
258 * sysctl(8) exported objects
260 struct mbstat mbstat; /* General stats + info. */
261 struct mbpstat mb_statpcpu[NCPU+1]; /* PCPU + Gen. container alloc stats */
262 int mbuf_wait = 64; /* Sleep time for wait code (ticks) */
263 u_int mbuf_limit = 512; /* Upper lim. on # of mbufs per CPU */
264 u_int clust_limit = 128; /* Upper lim. on # of clusts per CPU */
265 SYSCTL_DECL(_kern_ipc);
266 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
267 "Maximum number of mbuf clusters available");
268 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
269 "Maximum number of mbufs available");
270 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
271 "Number used to scale kmem_map to ensure sufficient space for counters");
272 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
273 "Maximum number of sendfile(2) sf_bufs available");
274 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
275 "Sleep time of mbuf subsystem wait allocations during exhaustion");
276 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
277 "Upper limit of number of mbufs allowed on each PCPU list");
278 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
279 "Upper limit of number of mbuf clusters allowed on each PCPU list");
280 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
281 "Mbuf general information and statistics");
282 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
283 sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
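/*
 * Example: the read-write knobs above can be inspected and changed from
 * userland with sysctl(8), e.g. (value illustrative only):
 *
 *	sysctl kern.ipc.mbuf_limit
 *	sysctl kern.ipc.mbuf_limit=1024
 *
 * The read-only entries (nmbclusters, nmbufs, nmbcnt, nsfbufs) can only be
 * set as loader tunables, as described above, and utilities such as
 * netstat(1) -m report the mbstat/mb_statpcpu data exported here.
 */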
286 * Prototypes of local allocator routines.
288 static __inline void *mb_alloc(struct mb_lstmngr *, int, short);
289 void *mb_alloc_wait(struct mb_lstmngr *, short);
290 static __inline void mb_free(struct mb_lstmngr *, void *, short);
291 static void mbuf_init(void *);
292 struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
293 struct mb_pcpu_list *);
294 void mb_reclaim(void);
297 * Initial allocation numbers. Each parameter represents the number of buckets
298 * of each object type that will initially be placed in each PCPU container.
301 #define NMB_MBUF_INIT 4
302 #define NMB_CLUST_INIT 16
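/*
 * Rough numbers for the defaults above (illustrative, assuming 4K pages,
 * MSIZE of 256 and MCLBYTES of 2K): since each bucket covers one page, each
 * PCPU container starts out with 4 pages of mbufs (4 * 4096 / 256 = 64
 * mbufs) and 16 pages of clusters (16 * 4096 / 2048 = 32 clusters).
 */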
305 * Initialize the mbuf subsystem.
307 * We sub-divide the kmem_map into several submaps; this way, we don't have
308 * to worry about artificially limiting the number of mbuf or mbuf cluster
309 * allocations, due to fear of one type of allocation "stealing" address
310 * space initially reserved for another.
312 * Set up both the general containers and all the PCPU containers. Populate
313 * the PCPU containers with initial numbers.
315 MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
316 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
318 mbuf_init(void *dummy)
320 struct mb_pcpu_list *pcpu_cnt;
321 vm_size_t mb_map_size;
325 * Setup all the submaps, for each type of object that we deal
326 * with in this allocator.
328 mb_map_size = (vm_size_t)(nmbufs * MSIZE);
329 mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
330 mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
331 sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
332 if (mb_list_mbuf.ml_btable == NULL)
334 mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
335 &(mb_list_mbuf.ml_maptop), mb_map_size);
336 mb_list_mbuf.ml_mapfull = 0;
337 mb_list_mbuf.ml_objsize = MSIZE;
338 mb_list_mbuf.ml_wmhigh = &mbuf_limit;
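/*
 * Illustrative sizing for the block above (assuming maxusers == 32, 4K
 * pages and MSIZE == 256): nmbclusters defaults to 1024 + 32 * 64 = 3072,
 * so nmbufs defaults to 6144 and the mbuf submap spans 6144 * 256 bytes,
 * i.e. 1536K or 384 pages; ml_btable then holds one bucket pointer per
 * page, 384 entries in this case.
 */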
340 mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
341 mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
342 mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
343 * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
344 if (mb_list_clust.ml_btable == NULL)
346 mb_list_clust.ml_map = kmem_suballoc(kmem_map,
347 &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
348 mb_map_size);
349 mb_list_clust.ml_mapfull = 0;
350 mb_list_clust.ml_objsize = MCLBYTES;
351 mb_list_clust.ml_wmhigh = &clust_limit;
353 /* XXX XXX XXX: mbuf_map->system_map = clust_map->system_map = 1 */
356 * Allocate required general (global) containers for each object type.
358 mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
360 mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
362 if ((mb_list_mbuf.ml_genlist == NULL) ||
363 (mb_list_clust.ml_genlist == NULL))
367 * Initialize condition variables and general container mutex locks.
369 mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", 0);
370 cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
371 cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
372 "mcluster pool starved");
373 mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
374 mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
377 * Setup the general containers for each object.
379 mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
380 mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
381 mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
382 mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
383 mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
384 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
385 mb_list_clust.ml_genlist->mb_cont.mc_objcount =
386 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
387 mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
388 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
389 mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
390 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
391 mb_list_mbuf.ml_genlist->mb_cont.mc_types =
392 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
393 mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
394 SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
395 SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
398 * Initialize general mbuf statistics
400 mbstat.m_msize = MSIZE;
401 mbstat.m_mclbytes = MCLBYTES;
402 mbstat.m_minclsize = MINCLSIZE;
403 mbstat.m_mlen = MLEN;
404 mbstat.m_mhlen = MHLEN;
405 mbstat.m_numtypes = MT_NTYPES;
408 * Allocate and initialize PCPU containers.
410 for (i = 0; i < NCPU; i++) {
414 mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
416 mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
418 if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
419 (mb_list_clust.ml_cntlst[i] == NULL))
422 mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", 0);
423 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
424 mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
426 mb_statpcpu[i].mb_active = 1;
427 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
428 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
429 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
430 mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
431 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
432 &(mb_statpcpu[i].mb_mbfree);
433 mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
434 &(mb_statpcpu[i].mb_clfree);
435 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
436 &(mb_statpcpu[i].mb_mbpgs);
437 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
438 &(mb_statpcpu[i].mb_clpgs);
439 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
440 &(mb_statpcpu[i].mb_mbtypes[0]);
441 mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
443 SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
444 SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
447 * Perform initial allocations.
449 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
450 MB_LOCK_CONT(pcpu_cnt);
451 for (j = 0; j < NMB_MBUF_INIT; j++) {
452 if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
456 MB_UNLOCK_CONT(pcpu_cnt);
458 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
459 MB_LOCK_CONT(pcpu_cnt);
460 for (j = 0; j < NMB_CLUST_INIT; j++) {
461 if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
465 MB_UNLOCK_CONT(pcpu_cnt);
470 panic("mbuf_init(): failed to initialize mbuf subsystem!");
474 * Populate a given mbuf PCPU container with a bucket full of fresh new
475 * buffers. Return a pointer to the new bucket (already in the container if
476 * successful), or return NULL on failure.
479 * PCPU container lock must be held when this is called.
480 * The lock is dropped here so that we can cleanly call the underlying VM
481 * code. If we fail, we return with no locks held. If we succeed (i.e. return
482 * non-NULL), we return with the PCPU lock held, ready for allocation from
483 * the returned bucket.
486 mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
488 struct mb_bucket *bucket;
492 MB_UNLOCK_CONT(cnt_lst);
494 * If our object's (finite) map is already starved (i.e. no more
495 * address space), bail out now.
497 if (mb_list->ml_mapfull)
500 bucket = malloc(sizeof(struct mb_bucket) +
501 PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
502 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
506 p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
507 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
509 free(bucket, M_MBUF);
510 if (how == M_TRYWAIT)
511 mb_list->ml_mapfull = 1;
515 bucket->mb_numfree = 0;
516 mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
517 for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
518 bucket->mb_free[i] = p;
519 bucket->mb_numfree++;
520 p += mb_list->ml_objsize;
523 MB_LOCK_CONT(cnt_lst);
524 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
525 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
526 (*(cnt_lst->mb_cont.mc_numpgs))++;
527 *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
533 * Allocate an mbuf-subsystem type object.
534 * The general case is very easy. Complications only arise if our PCPU
535 * container is empty. Things get worse if the PCPU container is empty,
536 * the general container is empty, and we've run out of address space
537 * in our map; then we try to block if we're willing to (M_TRYWAIT).
541 mb_alloc(struct mb_lstmngr *mb_list, int how, short type)
543 struct mb_pcpu_list *cnt_lst;
544 struct mb_bucket *bucket;
548 cnt_lst = MB_GET_PCPU_LIST(mb_list);
549 MB_LOCK_CONT(cnt_lst);
551 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
553 * This is the easy allocation case. We just grab an object
554 * from a bucket in the PCPU container. At worst, we
555 * have just emptied the bucket and so we remove it
556 * from the container.
558 MB_GET_OBJECT(m, bucket, cnt_lst);
559 MB_MBTYPES_INC(cnt_lst, type, 1);
560 MB_UNLOCK_CONT(cnt_lst);
562 struct mb_gen_list *gen_list;
565 * This is the less-common, more difficult case. We must
566 * first check whether the general list has anything for us
567 * and, if that also fails, allocate a page from
568 * the map and create a new bucket to place in our PCPU
569 * container (already locked). If the map is starved, then
570 * we're really in for trouble, as we have to wait on
571 * the general container's condition variable.
573 gen_list = MB_GET_GEN_LIST(mb_list);
574 MB_LOCK_CONT(gen_list);
576 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
579 * Give ownership of the bucket to our CPU's
580 * container, but only actually put the bucket
581 * in the container if it doesn't become free
582 * upon removing an mbuf from it.
584 SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
586 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
587 (*(gen_list->mb_cont.mc_numpgs))--;
588 (*(cnt_lst->mb_cont.mc_numpgs))++;
589 *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
590 bucket->mb_numfree--;
591 m = bucket->mb_free[(bucket->mb_numfree)];
592 if (bucket->mb_numfree == 0) {
593 SLIST_NEXT(bucket, mb_blist) = NULL;
594 bucket->mb_owner |= MB_BUCKET_FREE;
596 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
598 *(cnt_lst->mb_cont.mc_objcount) +=
601 MB_UNLOCK_CONT(gen_list);
602 MB_MBTYPES_INC(cnt_lst, type, 1);
603 MB_UNLOCK_CONT(cnt_lst);
606 * We'll have to allocate a new page.
608 MB_UNLOCK_CONT(gen_list);
609 bucket = mb_pop_cont(mb_list, how, cnt_lst);
610 if (bucket != NULL) {
611 bucket->mb_numfree--;
612 m = bucket->mb_free[(bucket->mb_numfree)];
613 (*(cnt_lst->mb_cont.mc_objcount))--;
614 MB_MBTYPES_INC(cnt_lst, type, 1);
615 MB_UNLOCK_CONT(cnt_lst);
617 if (how == M_TRYWAIT) {
619 * Absolute worst-case scenario. We block if
620 * we're willing to, but only after trying to
621 * steal from other lists.
623 m = mb_alloc_wait(mb_list, type);
626 * no way to indent this code decently
629 static int last_report;
630 /* XXX: No consistency. */
632 if (ticks < last_report ||
633 (ticks - last_report) >= hz) {
636 "mb_alloc for type %d failed, consider increase mbuf value.\n", type);
648 * This is the worst-case scenario called only if we're allocating with
649 * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
650 * by looking in every PCPU container. If we're still unsuccessful, we
651 * try the general container one last time, possibly blocking on its starved cv.
655 mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
657 struct mb_pcpu_list *cnt_lst;
658 struct mb_gen_list *gen_list;
659 struct mb_bucket *bucket;
664 * Try to reclaim mbuf-related objects (mbufs, clusters).
669 * Cycle all the PCPU containers. Increment starved counts on any found empty.
672 for (i = 0; i < NCPU; i++) {
675 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
676 MB_LOCK_CONT(cnt_lst);
679 * If container is non-empty, get a single object from it.
680 * If empty, increment starved count.
682 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
684 MB_GET_OBJECT(m, bucket, cnt_lst);
685 MB_MBTYPES_INC(cnt_lst, type, 1);
686 MB_UNLOCK_CONT(cnt_lst);
687 mbstat.m_wait++; /* XXX: No consistency. */
690 cnt_lst->mb_cont.mc_starved++;
692 MB_UNLOCK_CONT(cnt_lst);
696 * We're still here, so that means it's time to get the general
697 * container lock, check it one more time (now that mb_reclaim()
698 * has been called) and if we still get nothing, block on the cv.
700 gen_list = MB_GET_GEN_LIST(mb_list);
701 MB_LOCK_CONT(gen_list);
702 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
703 MB_GET_OBJECT(m, bucket, gen_list);
704 MB_MBTYPES_INC(gen_list, type, 1);
705 MB_UNLOCK_CONT(gen_list);
706 mbstat.m_wait++; /* XXX: No consistency. */
710 gen_list->mb_cont.mc_starved++;
711 cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
712 gen_list->mb_cont.mc_lock, mbuf_wait);
713 gen_list->mb_cont.mc_starved--;
715 if ((cv_ret == 0) &&
716 ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
717 MB_GET_OBJECT(m, bucket, gen_list);
718 MB_MBTYPES_INC(gen_list, type, 1);
719 mbstat.m_wait++; /* XXX: No consistency. */
721 mbstat.m_drops++; /* XXX: No consistency. */
725 MB_UNLOCK_CONT(gen_list);
731 * Free an object to its rightful container.
732 * In the general case, this operation is really easy.
733 * Complications arise primarily if:
734 * (a) We've hit the high limit on number of free objects allowed in
735 * our PCPU container.
736 * (b) We're in a critical situation where our container has been
737 * marked 'starved' and we need to issue wakeups on the starved
738 * condition variable.
739 * (c) Minor (odd) cases: our bucket has migrated while we were
740 * waiting for the lock; our bucket is in the general container;
741 * our bucket is empty.
745 mb_free(struct mb_lstmngr *mb_list, void *m, short type)
747 struct mb_pcpu_list *cnt_lst;
748 struct mb_gen_list *gen_list;
749 struct mb_bucket *bucket;
752 bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
755 * Make sure that if the bucket has migrated after we lock its present
756 * container, we drop that lock and grab the new one.
759 owner = bucket->mb_owner & ~MB_BUCKET_FREE;
761 case MB_GENLIST_OWNER:
762 gen_list = MB_GET_GEN_LIST(mb_list);
763 MB_LOCK_CONT(gen_list);
764 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
765 MB_UNLOCK_CONT(gen_list);
770 * If we're intended for the general container, this is
771 * real easy: no migrating required. The only `bogon'
772 * is that we're now contending with all the threads
773 * dealing with the general list, but this is expected.
775 MB_PUT_OBJECT(m, bucket, gen_list);
776 MB_MBTYPES_DEC(gen_list, type, 1);
777 if (gen_list->mb_cont.mc_starved > 0)
778 cv_signal(&(gen_list->mgl_mstarved));
779 MB_UNLOCK_CONT(gen_list);
783 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
784 MB_LOCK_CONT(cnt_lst);
785 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
786 MB_UNLOCK_CONT(cnt_lst);
790 MB_PUT_OBJECT(m, bucket, cnt_lst);
791 MB_MBTYPES_DEC(cnt_lst, type, 1);
793 if (cnt_lst->mb_cont.mc_starved > 0) {
795 * This is a tough case. It means that we've
796 * been flagged at least once to indicate that
797 * we're empty, and that the system is in a critical
798 * situation, so we ought to migrate at least one
799 * bucket over to the general container.
800 * There may or may not be a thread blocking on
801 * the starved condition variable, but chances
802 * are that one will come along soon, so
803 * it's better to migrate now than never.
805 gen_list = MB_GET_GEN_LIST(mb_list);
806 MB_LOCK_CONT(gen_list);
807 KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0,
808 ("mb_free: corrupt bucket %p\n", bucket));
809 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
811 bucket->mb_owner = MB_GENLIST_OWNER;
812 (*(cnt_lst->mb_cont.mc_objcount))--;
813 (*(gen_list->mb_cont.mc_objcount))++;
814 (*(cnt_lst->mb_cont.mc_numpgs))--;
815 (*(gen_list->mb_cont.mc_numpgs))++;
818 * Determine whether or not to keep transferring
819 * buckets to the general list or whether we've
820 * transferred enough already.
821 * We realize that although we may flag another
822 * bucket to be migrated to the general container,
823 * in the meantime the thread that was
824 * blocked on the cv may already have woken up and
825 * be long gone. But in that case, the worst
826 * consequence is that we will end up migrating
827 * one bucket too many, which is really not a big
828 * deal, especially if we're close to a critical situation.
831 if (gen_list->mb_cont.mc_starved > 0) {
832 cnt_lst->mb_cont.mc_starved--;
833 cv_signal(&(gen_list->mgl_mstarved));
835 cnt_lst->mb_cont.mc_starved = 0;
837 MB_UNLOCK_CONT(gen_list);
838 MB_UNLOCK_CONT(cnt_lst);
842 if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) {
844 * We've hit the high limit on the number of mbufs allowed
845 * on this PCPU list. We must now migrate a bucket
846 * over to the general container.
848 gen_list = MB_GET_GEN_LIST(mb_list);
849 MB_LOCK_CONT(gen_list);
850 if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
852 SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
853 SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
856 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
858 bucket->mb_owner = MB_GENLIST_OWNER;
859 *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
860 *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
861 (*(cnt_lst->mb_cont.mc_numpgs))--;
862 (*(gen_list->mb_cont.mc_numpgs))++;
865 * While we're at it, transfer some of the mbtypes
866 * "count load" onto the general list's mbtypes
867 * array, since we're moving the bucket
868 * there now, meaning that freeing objects
869 * there will now decrement the _general list's_
870 * mbtypes counters, and no longer our PCPU list's
871 * mbtypes counters. We do this for the type presently
872 * being freed in an effort to keep the mbtypes
873 * counters approximately balanced across all lists.
875 MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
876 mb_list->ml_objsize) - bucket->mb_numfree);
877 MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
878 mb_list->ml_objsize) - bucket->mb_numfree);
880 MB_UNLOCK_CONT(gen_list);
881 MB_UNLOCK_CONT(cnt_lst);
885 if (bucket->mb_owner & MB_BUCKET_FREE) {
886 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
888 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
891 MB_UNLOCK_CONT(cnt_lst);
899 * Drain protocols in the hope of freeing up some resources.
902 * No locks should be held when this is called. The drain routines have to
903 * presently acquire some locks, which raises the possibility of a lock order
904 * violation if we're holding any mutex that is acquired in reverse
905 * order relative to one of the locks in the drain routines.
914 * XXX: Argh, we almost always trip here with witness turned on nowadays
915 * XXX: because we often come in with Giant held. For now, there's no way
916 * XXX: to avoid this.
919 KASSERT(witness_list(curthread) == 0,
920 ("mb_reclaim() called with locks held"));
923 mbstat.m_drain++; /* XXX: No consistency. */
925 for (dp = domains; dp; dp = dp->dom_next)
926 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
927 if (pr->pr_drain)
928 (*pr->pr_drain)();
933 * Local mbuf & cluster alloc macros and routines.
934 * Local macro and function names begin with an underscore ("_").
936 void _mext_free(struct mbuf *);
937 void _mclfree(struct mbuf *);
939 #define _m_get(m, how, type) do { \
940 (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \
942 (m)->m_type = (type); \
943 (m)->m_next = NULL; \
944 (m)->m_nextpkt = NULL; \
945 (m)->m_data = (m)->m_dat; \
950 #define _m_gethdr(m, how, type) do { \
951 (m) = (struct mbuf *)mb_alloc(&mb_list_mbuf, (how), (type)); \
953 (m)->m_type = (type); \
954 (m)->m_next = NULL; \
955 (m)->m_nextpkt = NULL; \
956 (m)->m_data = (m)->m_pktdat; \
957 (m)->m_flags = M_PKTHDR; \
958 (m)->m_pkthdr.rcvif = NULL; \
959 (m)->m_pkthdr.csum_flags = 0; \
960 (m)->m_pkthdr.aux = NULL; \
964 /* XXX: Check for M_PKTHDR && m_pkthdr.aux is bogus... please fix (see KAME) */
965 #define _m_free(m, n) do { \
967 if ((m)->m_flags & M_EXT) \
969 if (((m)->m_flags & M_PKTHDR) != 0 && (m)->m_pkthdr.aux) { \
970 m_freem((m)->m_pkthdr.aux); \
971 (m)->m_pkthdr.aux = NULL; \
973 mb_free(&mb_list_mbuf, (m), (m)->m_type); \
976 #define _mext_init_ref(m) do { \
977 (m)->m_ext.ref_cnt = malloc(sizeof(u_int), M_MBUF, M_NOWAIT); \
978 if ((m)->m_ext.ref_cnt != NULL) { \
979 *((m)->m_ext.ref_cnt) = 0; \
984 #define _mext_dealloc_ref(m) \
985 free((m)->m_ext.ref_cnt, M_MBUF)
988 _mext_free(struct mbuf *mb)
991 if (mb->m_ext.ext_type == EXT_CLUSTER)
992 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF);
994 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
996 _mext_dealloc_ref(mb);
1000 /* We only include this here to avoid making m_clget() excessively large
1001 * due to too much inlined code. */
1003 _mclfree(struct mbuf *mb)
1006 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF);
1007 mb->m_ext.ext_buf = NULL;
1012 * Exported space allocation and de-allocation routines.
1015 m_get(int how, int type)
1019 _m_get(mb, how, type);
1024 m_gethdr(int how, int type)
1028 _m_gethdr(mb, how, type);
1033 m_get_clrd(int how, int type)
1037 _m_get(mb, how, type);
1040 bzero(mtod(mb, caddr_t), MLEN);
1046 m_gethdr_clrd(int how, int type)
1050 _m_gethdr(mb, how, type);
1053 bzero(mtod(mb, caddr_t), MHLEN);
1059 m_free(struct mbuf *mb)
1068 m_clget(struct mbuf *mb, int how)
1071 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how, MT_NOTMBUF);
1072 if (mb->m_ext.ext_buf != NULL) {
1074 if (mb->m_ext.ref_cnt == NULL)
1077 mb->m_data = mb->m_ext.ext_buf;
1078 mb->m_flags |= M_EXT;
1079 mb->m_ext.ext_free = NULL;
1080 mb->m_ext.ext_args = NULL;
1081 mb->m_ext.ext_size = MCLBYTES;
1082 mb->m_ext.ext_type = EXT_CLUSTER;
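/*
 * Typical calling sketch for m_clget() (illustrative only; error handling
 * abbreviated):
 *
 *	struct mbuf *m;
 *
 *	m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m_clget(m, M_DONTWAIT);
 *		if ((m->m_flags & M_EXT) == 0) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 *
 * Callers conventionally test M_EXT afterwards, since the flag is only set
 * once both the cluster and its reference counter have been allocated.
 */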
1089 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
1090 void (*freef)(caddr_t, void *), void *args, short flags, int type)
1094 if (mb->m_ext.ref_cnt != NULL) {
1095 mb->m_flags |= (M_EXT | flags);
1096 mb->m_ext.ext_buf = buf;
1097 mb->m_data = mb->m_ext.ext_buf;
1098 mb->m_ext.ext_size = size;
1099 mb->m_ext.ext_free = freef;
1100 mb->m_ext.ext_args = args;
1101 mb->m_ext.ext_type = type;
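/*
 * Usage sketch for m_extadd() (illustrative; the buffer, its size, the pool
 * argument and the EXT_NET_DRV type are assumptions, not requirements):
 *
 *	static void
 *	my_ext_free(caddr_t buf, void *args)
 *	{
 *		... return buf to its private pool ...
 *	}
 *
 *	m_extadd(m, my_buf, MYBUFSIZE, my_ext_free, my_pool, 0, EXT_NET_DRV);
 *	if ((m->m_flags & M_EXT) == 0)
 *		... attach failed: the reference counter could not be
 *		    allocated, so the caller still owns my_buf ...
 *
 * As with m_clget(), M_EXT is only set once _mext_init_ref() has managed to
 * allocate the reference counter.
 */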
1107 * Change type for mbuf `mb'; this is a relatively expensive operation and
1108 * should be avoided.
1111 m_chtype(struct mbuf *mb, short new_type)
1113 struct mb_gen_list *gen_list;
1115 gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
1116 MB_LOCK_CONT(gen_list);
1117 MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
1118 MB_MBTYPES_INC(gen_list, new_type, 1);
1119 MB_UNLOCK_CONT(gen_list);
1120 mb->m_type = new_type;