2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.1 2008/04/06 18:58:15 dillon Exp $
38 * Matt: I gutted altq_priq.c and used it as a skeleton on which to build
39 * fairq. The fairq algorithm is completely different than priq, of course,
40 * but because I used priq's skeleton I believe I should include priq's
43 * Copyright (C) 2000-2003
44 * Sony Computer Science Laboratories Inc. All rights reserved.
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
55 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * FAIRQ - take traffic classified by keep state (hashed into
70 * mbuf->m_pkthdr.altq_state_hash) and bucketize it. Fairly extract
71 * the first packet from each bucket in a round-robin fashion.
73 * TODO - better overall qlimit support (right now it is per-bucket).
74 * - NOTE: red etc is per bucket, not overall.
75 * - better service curve support.
79 * altq on em0 fairq bandwidth 650Kb queue { std, bulk }
80 * queue std priority 3 bandwidth 400Kb \
81 * fairq (buckets 64, default, hogs 1Kb) qlimit 50
82 * queue bulk priority 2 bandwidth 100Kb \
83 * fairq (buckets 64, hogs 1Kb) qlimit 50
85 * pass out on em0 from any to any keep state queue std
86 * pass out on em0 inet proto tcp ..... port ... keep state queue bulk
90 #include "opt_inet6.h"
92 #ifdef ALTQ_FAIRQ /* fairq is enabled in the kernel conf */
94 #include <sys/param.h>
95 #include <sys/malloc.h>
97 #include <sys/socket.h>
98 #include <sys/sockio.h>
99 #include <sys/systm.h>
100 #include <sys/proc.h>
101 #include <sys/errno.h>
102 #include <sys/kernel.h>
103 #include <sys/queue.h>
106 #include <net/if_var.h>
107 #include <netinet/in.h>
109 #include <netpfil/pf/pf.h>
110 #include <netpfil/pf/pf_altq.h>
111 #include <netpfil/pf/pf_mtag.h>
112 #include <altq/altq.h>
113 #include <altq/altq_fairq.h>
116 * function prototypes
/*
 * Forward declarations of the file-local (static) helpers.
 * fairq_enqueue, fairq_dequeue and fairq_request are the callbacks handed
 * to altq_attach() by fairq_pfattach(); the remaining routines manage
 * classes, the per-bucket queues, statistics and handle-to-class lookup.
 */
118 static int fairq_clear_interface(struct fairq_if *);
119 static int fairq_request(struct ifaltq *, int, void *);
120 static void fairq_purge(struct fairq_if *);
121 static struct fairq_class *fairq_class_create(struct fairq_if *, int, int, u_int, struct fairq_opts *, int);
122 static int fairq_class_destroy(struct fairq_class *);
123 static int fairq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
124 static struct mbuf *fairq_dequeue(struct ifaltq *, int);
126 static int fairq_addq(struct fairq_class *, struct mbuf *, u_int32_t);
127 static struct mbuf *fairq_getq(struct fairq_class *, uint64_t);
128 static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *);
129 static fairq_bucket_t *fairq_selectq(struct fairq_class *, int);
130 static void fairq_purgeq(struct fairq_class *);
132 static void get_class_stats(struct fairq_classstats *, struct fairq_class *);
133 static struct fairq_class *clh_to_clp(struct fairq_if *, uint32_t);
/*
 * Attach the FAIRQ discipline to an interface send queue.
 *
 * a: pf queue description.  a->ifname selects the interface and
 *    a->altq_disc carries the discipline state handed to altq_attach().
 *
 * fairq_enqueue, fairq_dequeue and fairq_request are registered as the
 * callbacks; the final two altq_attach() arguments are passed as NULL.
 *
 * NOTE(review): the local declarations and the return statements of this
 * function are not visible in this listing.
 */
136 fairq_pfattach(struct pf_altq *a)
/* Both a resolvable interface and a non-NULL a->altq_disc are required. */
141 if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
144 error = altq_attach(&ifp->if_snd, ALTQT_FAIRQ, a->altq_disc,
145 fairq_enqueue, fairq_dequeue, fairq_request, NULL, NULL);
/*
 * Allocate and initialise the per-interface FAIRQ state (struct fairq_if)
 * for the interface named by a->ifname.
 *
 * a: pf queue description; a->ifbandwidth is copied into pif_bandwidth.
 *
 * NOTE(review): the statements taken when a check fails, and the statement
 * following the "keep the state" comment, are not visible in this listing.
 */
151 fairq_add_altq(struct pf_altq *a)
153 struct fairq_if *pif;
156 if ((ifp = ifunit(a->ifname)) == NULL)
/* The interface send queue must report ALTQ_IS_READY. */
158 if (!ALTQ_IS_READY(&ifp->if_snd))
/* Zero-filled allocation (M_ZERO), so every class slot starts out NULL. */
162 pif = malloc(sizeof(struct fairq_if),
163 M_DEVBUF, M_WAITOK | M_ZERO);
164 pif->pif_bandwidth = a->ifbandwidth;
/* -1 means "no class installed"; fairq_class_create() raises it. */
165 pif->pif_maxpri = -1;
166 pif->pif_ifq = &ifp->if_snd;
168 /* keep the state in pf_altq */
/*
 * Tear down the per-interface FAIRQ state referenced by a->altq_disc.
 * The visible portion destroys every installed class through
 * fairq_clear_interface().
 *
 * NOTE(review): the handling of a NULL a->altq_disc and whatever follows
 * the clear (releasing pif, return value) is not visible in this listing.
 */
175 fairq_remove_altq(struct pf_altq *a)
177 struct fairq_if *pif;
/* Guard: there must be discipline state attached to this pf_altq. */
179 if ((pif = a->altq_disc) == NULL)
183 fairq_clear_interface(pif);
/*
 * Create a FAIRQ class for the queue described by `a` on the interface
 * state stored in a->altq_disc.
 *
 * The priority, queue limit, bandwidth, FAIRQ options and queue id are all
 * taken from `a` and forwarded to fairq_class_create().
 *
 * NOTE(review): the error returns for the checks below and the handling of
 * the fairq_class_create() result are not visible in this listing.
 */
190 fairq_add_queue(struct pf_altq *a)
192 struct fairq_if *pif;
193 struct fairq_class *cl;
195 if ((pif = a->altq_disc) == NULL)
198 /* check parameters */
/* a->priority is used below as the index into pif_classes[]. */
199 if (a->priority >= FAIRQ_MAXPRI)
/* Each priority slot holds at most one class. */
203 if (pif->pif_classes[a->priority] != NULL)
/* The queue id must not already resolve to an existing class. */
205 if (clh_to_clp(pif, a->qid) != NULL)
208 cl = fairq_class_create(pif, a->priority, a->qlimit, a->bandwidth,
209 &a->pq_u.fairq_opts, a->qid);
/*
 * Destroy the class whose handle equals a->qid on the interface state in
 * a->altq_disc.  On the success path the result of fairq_class_destroy()
 * is returned.
 *
 * NOTE(review): the statements taken when either lookup fails are not
 * visible in this listing.
 */
217 fairq_remove_queue(struct pf_altq *a)
219 struct fairq_if *pif;
220 struct fairq_class *cl;
222 if ((pif = a->altq_disc) == NULL)
/* Resolve the queue id to its class. */
225 if ((cl = clh_to_clp(pif, a->qid)) == NULL)
228 return (fairq_class_destroy(cl));
/*
 * Copy the statistics of the class identified by a->qid to a user buffer.
 *
 * a:      pf queue description; a->ifname and a->qid select the class.
 * ubuf:   destination passed to copyout().
 * nbytes: in: size of ubuf; out: set to sizeof(stats) after the copyout()
 *         step.
 *
 * NOTE(review): the error returns for the failing checks are not visible
 * in this listing.
 */
232 fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
234 struct fairq_if *pif;
235 struct fairq_class *cl;
/*
 * NOTE(review): no zeroing of `stats` is visible before it is filled and
 * copied out; confirm every byte (including padding and any unused red[]
 * entries) is initialised so no kernel stack contents reach userland.
 */
236 struct fairq_classstats stats;
/* The discipline state is looked up by interface name and type here. */
239 if ((pif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL)
242 if ((cl = clh_to_clp(pif, a->qid)) == NULL)
/*
 * NOTE(review): *nbytes is an int compared against a size_t, so a negative
 * value converts to a large unsigned number and is not caught here.
 */
245 if (*nbytes < sizeof(stats))
248 get_class_stats(&stats, cl);
250 if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
252 *nbytes = sizeof(stats);
257 * bring the interface back to the initial state by discarding
258 * all the filters and classes.
/*
 * Destroy every class installed on `pif`.
 *
 * Slots are visited from priority 0 up to pif_maxpri.  Note that
 * fairq_class_destroy() itself lowers pif_maxpri when the highest class is
 * removed, which also terminates this loop.
 */
261 fairq_clear_interface(struct fairq_if *pif)
263 struct fairq_class *cl;
266 /* clear out the classes */
267 for (pri = 0; pri <= pif->pif_maxpri; pri++) {
268 if ((cl = pif->pif_classes[pri]) != NULL)
269 fairq_class_destroy(cl);
/*
 * ALTQ request handler; registered with altq_attach() by fairq_pfattach().
 *
 * ifq: send queue whose altq_disc field holds the struct fairq_if.
 * req: request code.
 * arg: request argument.
 *
 * NOTE(review): the request dispatch and the return value are not visible
 * in this listing, so the set of handled requests cannot be stated here.
 */
276 fairq_request(struct ifaltq *ifq, int req, void *arg)
278 struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
/* Asserts the ifq lock; presumably the caller is expected to hold it. */
280 IFQ_LOCK_ASSERT(ifq);
290 /* discard all the queued packets on the interface */
/*
 * Visit every class slot from priority 0 to pif_maxpri and act on each
 * class that has a non-NULL cl_head, then reset the interface queue length
 * to zero when ALTQ is enabled on it.
 *
 * NOTE(review): the statement executed for each matching class is not
 * visible in this listing (presumably the per-class purge; confirm).
 */
292 fairq_purge(struct fairq_if *pif)
294 struct fairq_class *cl;
297 for (pri = 0; pri <= pif->pif_maxpri; pri++) {
/* Only classes with a non-NULL cl_head are selected. */
298 if ((cl = pif->pif_classes[pri]) != NULL && cl->cl_head)
301 if (ALTQ_IS_ENABLED(pif->pif_ifq))
302 pif->pif_ifq->ifq_len = 0;
/*
 * Create the class for priority slot `pri` on `pif`, or reconfigure the
 * class already installed in that slot.
 *
 * pif:       per-interface FAIRQ state.
 * pri:       priority; index into pif->pif_classes[].
 * qlimit:    queue limit, applied to every bucket of the class.
 * bandwidth: class bandwidth; stored divided by 8 (presumably bits/s
 *            converted to bytes/s -- confirm against the caller).
 * opts:      FAIRQ options (flags, nbuckets, hogs_m1, lssc_m1).
 * qid:       queue id (its use is not visible in this listing).
 *
 * Returns a pointer to the class (see the declared return type).
 *
 * NOTE(review): many original lines (braces, conditional-compilation
 * guards, some conditions and the return statements) are absent from this
 * listing; the comments below describe only the visible statements.
 */
305 static struct fairq_class *
306 fairq_class_create(struct fairq_if *pif, int pri, int qlimit,
307 u_int bandwidth, struct fairq_opts *opts, int qid)
309 struct fairq_class *cl;
310 int flags = opts->flags;
311 u_int nbuckets = opts->nbuckets;
/* Diagnostic path for a RED request (its guard is not visible here). */
315 if (flags & FARF_RED) {
317 printf("fairq_class_create: RED not configured for FAIRQ!\n");
/* Clamp the bucket count to the supported maximum. */
324 if (nbuckets > FAIRQ_MAX_BUCKETS)
325 nbuckets = FAIRQ_MAX_BUCKETS;
326 /* enforce power-of-2 size */
/*
 * n ^ (n - 1) equals (n << 1) - 1 only when n is a power of two, so this
 * loop keeps running until nbuckets is one.  A power of two is required
 * because cl_nbucket_mask = nbuckets - 1 is used as a hash mask.
 */
327 while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1))
330 if ((cl = pif->pif_classes[pri]) != NULL) {
331 /* modify the class instead of creating a new one */
/* The existing class is manipulated under the interface queue lock. */
332 IFQ_LOCK(cl->cl_pif->pif_ifq);
335 IFQ_UNLOCK(cl->cl_pif->pif_ifq);
/* Release any RIO/RED state left over from the previous configuration. */
337 if (cl->cl_qtype == Q_RIO)
338 rio_destroy((rio_t *)cl->cl_red);
341 if (cl->cl_qtype == Q_RED)
342 red_destroy(cl->cl_red);
/* Zero-filled allocation of the class and of its bucket array. */
345 cl = malloc(sizeof(struct fairq_class),
346 M_DEVBUF, M_WAITOK | M_ZERO);
347 cl->cl_nbuckets = nbuckets;
348 cl->cl_nbucket_mask = nbuckets - 1;
350 cl->cl_buckets = malloc(
351 sizeof(struct fairq_bucket) * cl->cl_nbuckets,
352 M_DEVBUF, M_WAITOK | M_ZERO);
/* Install the class in its priority slot; optionally make it the default. */
356 pif->pif_classes[pri] = cl;
357 if (flags & FARF_DEFAULTCLASS)
358 pif->pif_default = cl;
/* The condition selecting the default limit is not visible here. */
360 qlimit = 50; /* use default */
361 cl->cl_qlimit = qlimit;
/* The limit is per bucket, not per class. */
362 for (i = 0; i < cl->cl_nbuckets; ++i) {
363 qlimit(&cl->cl_buckets[i].queue) = qlimit;
365 cl->cl_bandwidth = bandwidth / 8;
/* Drop-tail is the default queue type; RED/RIO may override it below. */
366 cl->cl_qtype = Q_DROPTAIL;
367 cl->cl_flags = flags & FARF_USERFLAGS;
/* Track the highest priority that has a class installed. */
369 if (pri > pif->pif_maxpri)
370 pif->pif_maxpri = pri;
/* Option rates are stored divided by 8, like cl_bandwidth. */
373 cl->cl_hogs_m1 = opts->hogs_m1 / 8;
374 cl->cl_lssc_m1 = opts->lssc_m1 / 8; /* NOT YET USED */
/* Optional RED/RIO active queue management. */
377 if (flags & (FARF_RED|FARF_RIO)) {
378 int red_flags, red_pkttime;
381 if (flags & FARF_ECN)
382 red_flags |= REDF_ECN;
384 if (flags & FARF_CLEARDSCP)
385 red_flags |= RIOF_CLEARDSCP;
/*
 * Packet time: below 8 the divisor (pif_bandwidth / 8) would be zero, so a
 * fixed one second is used instead.  Otherwise it is MTU * 10^9 divided by
 * pif_bandwidth / 8.
 * NOTE(review): the 64-bit quotient is stored in an int; for very low
 * bandwidths it can exceed INT_MAX -- confirm this is acceptable.
 */
387 if (pif->pif_bandwidth < 8)
388 red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
390 red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
391 * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
/* The queue type only changes when the allocation succeeded. */
393 if (flags & FARF_RIO) {
394 cl->cl_red = (red_t *)rio_alloc(0, NULL,
395 red_flags, red_pkttime);
396 if (cl->cl_red != NULL)
397 cl->cl_qtype = Q_RIO;
/* RED thresholds are passed as 10% and 30% of the class queue limit. */
400 if (flags & FARF_RED) {
401 cl->cl_red = red_alloc(0, 0,
402 cl->cl_qlimit * 10/100,
403 cl->cl_qlimit * 30/100,
404 red_flags, red_pkttime);
405 if (cl->cl_red != NULL)
406 cl->cl_qtype = Q_RED;
409 #endif /* ALTQ_RED */
/*
 * Unlink class `cl` from its interface and release its resources.
 *
 * Under the interface queue lock the class is removed from its priority
 * slot, a poll-cache reference to it is dropped, and pif_maxpri is
 * recomputed when the class was the highest one.  After unlocking, any
 * RED/RIO state is destroyed and the bucket array is freed.
 *
 * NOTE(review): the assignment of `pif`, the statements between lock and
 * slot removal, the release of `cl` itself and the return value are not
 * visible in this listing.
 */
415 fairq_class_destroy(struct fairq_class *cl)
417 struct fairq_if *pif;
420 IFQ_LOCK(cl->cl_pif->pif_ifq);
426 pif->pif_classes[cl->cl_pri] = NULL;
/* Never leave the poll cache pointing at a class that is going away. */
427 if (pif->pif_poll_cache == cl)
428 pif->pif_poll_cache = NULL;
/*
 * The highest class was removed: scan downward for the next occupied
 * slot; -1 is stored when no class is found.
 */
429 if (pif->pif_maxpri == cl->cl_pri) {
430 for (pri = cl->cl_pri; pri >= 0; pri--)
431 if (pif->pif_classes[pri] != NULL) {
432 pif->pif_maxpri = pri;
436 pif->pif_maxpri = -1;
438 IFQ_UNLOCK(cl->cl_pif->pif_ifq);
/* cl_red holds either RED or RIO state, selected by cl_qtype. */
440 if (cl->cl_red != NULL) {
442 if (cl->cl_qtype == Q_RIO)
443 rio_destroy((rio_t *)cl->cl_red);
446 if (cl->cl_qtype == Q_RED)
447 red_destroy(cl->cl_red);
450 free(cl->cl_buckets, M_DEVBUF);
457 * fairq_enqueue is an enqueue function to be registered to
458 * (*altq_enqueue) in struct ifaltq.
/*
 * Enqueue callback registered with altq_attach().
 *
 * ifq:     send queue; ifq->altq_disc holds the struct fairq_if.
 * m:       packet to queue; it must carry a packet header (M_PKTHDR).
 * pktattr: packet attributes (no use is visible in this listing).
 *
 * The class comes from the pf tag on the mbuf (t->qid); pif_default is the
 * fallback.  The tag's qid_hash (0 when there is no tag) is handed to
 * fairq_addq() as the bucket id.
 *
 * NOTE(review): the conditions around the default-class fallback, the
 * computation of `len`, and the return statements are not visible in this
 * listing.
 */
461 fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
463 struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
464 struct fairq_class *cl = NULL; /* Make compiler happy */
466 u_int32_t qid_hash = 0;
469 IFQ_LOCK_ASSERT(ifq);
471 /* grab class set by classifier */
472 if ((m->m_flags & M_PKTHDR) == 0) {
473 /* should not happen */
474 printf("altq: packet for %s does not have pkthdr\n",
475 ifq->altq_ifp->if_xname);
/* The pf tag supplies both the class handle and the bucket hash. */
480 if ((t = pf_find_mtag(m)) != NULL) {
481 cl = clh_to_clp(pif, t->qid);
482 qid_hash = t->qid_hash;
485 cl = pif->pif_default;
/* fairq_dequeue() skips classes that do not carry this flag. */
491 cl->cl_flags |= FARF_HAS_PACKETS;
492 cl->cl_pktattr = NULL;
494 if (fairq_addq(cl, m, qid_hash) != 0) {
495 /* drop occurred. mbuf was freed in fairq_addq. */
496 PKTCNTR_ADD(&cl->cl_dropcnt, len);
505 * fairq_dequeue is a dequeue function to be registered to
506 * (*altq_dequeue) in struct ifaltq.
508 * note: ALTDQ_POLL returns the next packet without removing the packet
509 * from the queue. ALTDQ_REMOVE is a normal dequeue operation.
510 * ALTDQ_REMOVE must return the same packet if called immediately after ALTDQ_POLL.
/*
 * Dequeue callback registered with altq_attach().
 *
 * ifq: send queue; ifq->altq_disc holds the struct fairq_if.
 * op:  ALTDQ_POLL or ALTDQ_REMOVE.
 *
 * A class chosen by a poll is remembered in pif_poll_cache so that a
 * following ALTDQ_REMOVE takes its packet from that same class.  Without a
 * cached class, the slots are scanned from pif_maxpri down to 0 and
 * fairq_pollq() reports, through hit_limit, whether a class is over its
 * bandwidth.
 *
 * NOTE(review): several statements (early returns, the bookkeeping inside
 * the scan loop, ifq length updates) are not visible in this listing.
 */
514 fairq_dequeue(struct ifaltq *ifq, int op)
516 struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
517 struct fairq_class *cl;
518 struct fairq_class *best_cl;
520 struct mbuf *m = NULL;
/* One timestamp is taken and reused for all bandwidth accounting below. */
521 uint64_t cur_time = read_machclk();
525 IFQ_LOCK_ASSERT(ifq);
527 if (IFQ_IS_EMPTY(ifq)) {
/* Remove after poll: take the packet from the cached class. */
531 if (pif->pif_poll_cache && op == ALTDQ_REMOVE) {
532 best_cl = pif->pif_poll_cache;
533 m = fairq_getq(best_cl, cur_time);
534 pif->pif_poll_cache = NULL;
537 PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
/* Scan from the highest priority downward. */
544 for (pri = pif->pif_maxpri; pri >= 0; pri--) {
545 if ((cl = pif->pif_classes[pri]) == NULL)
/* Skip classes that are not marked as holding packets. */
547 if ((cl->cl_flags & FARF_HAS_PACKETS) == 0)
549 m = fairq_pollq(cl, cur_time, &hit_limit);
/* The flag is cleared here; the guarding condition is not visible. */
551 cl->cl_flags &= ~FARF_HAS_PACKETS;
556 * Only override the best choice if we are under
/* Taken when the class is under its limit or nothing was chosen yet. */
559 if (hit_limit == 0 || best_cl == NULL) {
565 * Remember the highest priority mbuf in case we
566 * do not find any lower priority mbufs.
/* Poll: remember the choice.  Remove: actually take the packet. */
572 if (op == ALTDQ_POLL) {
573 pif->pif_poll_cache = best_cl;
575 } else if (best_cl) {
576 m = fairq_getq(best_cl, cur_time);
579 PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
/*
 * Queue mbuf `m` on one bucket of class `cl`.
 *
 * cl:       target class.
 * m:        packet to queue.
 * bucketid: hash used to pick the bucket; it is masked with
 *           cl_nbucket_mask to index cl_buckets[].  Two alternative
 *           choices are also visible: the bucket before cl_head (the tail
 *           of the active list) and bucket 0.
 *
 * A bucket that is not yet in use is linked into the circular list of
 * active buckets headed by cl_head.  When both the bucket's bw_delta and
 * the class's cl_hogs_m1 are non-zero, the bucket rate
 * bw_bytes * machclk_freq / bw_delta is compared against cl_hogs_m1.
 *
 * The packet is then handed to rio_addq() or red_addq() when the class
 * uses RIO or RED (their result is returned directly); otherwise the
 * bucket length is checked against the bucket limit.  With FARF_CLEARDSCP
 * set, write_dsfield() is called with the value 0.
 *
 * NOTE(review): the conditions selecting between the three bucket
 * choices, the list-insertion details, the drop handling and the final
 * enqueue/return are not visible in this listing.
 */
588 fairq_addq(struct fairq_class *cl, struct mbuf *m, u_int32_t bucketid)
595 * If the packet doesn't have any keep state put it on the end of
596 * our queue. XXX this can result in out of order delivery.
600 b = cl->cl_head->prev;
602 b = &cl->cl_buckets[0];
604 hindex = bucketid & cl->cl_nbucket_mask;
605 b = &cl->cl_buckets[hindex];
609 * Add the bucket to the end of the circular list of active buckets.
611 * As a special case we add the bucket to the beginning of the list
612 * instead of the end if it was not previously on the list and if
613 * its traffic is less then the hog level.
615 if (b->in_use == 0) {
617 if (cl->cl_head == NULL) {
622 b->next = cl->cl_head;
623 b->prev = cl->cl_head->prev;
627 if (b->bw_delta && cl->cl_hogs_m1) {
628 bw = b->bw_bytes * machclk_freq / b->bw_delta;
629 if (bw < cl->cl_hogs_m1)
636 if (cl->cl_qtype == Q_RIO)
637 return rio_addq((rio_t *)cl->cl_red, &b->queue, m, cl->cl_pktattr);
640 if (cl->cl_qtype == Q_RED)
641 return red_addq(cl->cl_red, &b->queue, m, cl->cl_pktattr);
643 if (qlen(&b->queue) >= qlimit(&b->queue)) {
648 if (cl->cl_flags & FARF_CLEARDSCP)
649 write_dsfield(m, cl->cl_pktattr, 0);
/*
 * Remove the next packet from class `cl` and update bandwidth accounting.
 *
 * cl:       class to dequeue from.
 * cur_time: current machine-clock timestamp supplied by the caller.
 *
 * The bucket is chosen by fairq_selectq(cl, 0) and the packet is taken
 * with rio_getq(), red_getq() or _getq() according to cl_qtype.  The
 * elapsed time and the packet length are then accumulated for the class
 * and for the bucket, and each accumulator is reduced by one eighth.
 *
 * NOTE(review): the NULL checks on the selected bucket and the dequeued
 * mbuf, and the return statement, are not visible in this listing.
 */
657 fairq_getq(struct fairq_class *cl, uint64_t cur_time)
662 b = fairq_selectq(cl, 0);
666 else if (cl->cl_qtype == Q_RIO)
667 m = rio_getq((rio_t *)cl->cl_red, &b->queue);
670 else if (cl->cl_qtype == Q_RED)
671 m = red_getq(cl->cl_red, &b->queue);
674 m = _getq(&b->queue);
677 * Calculate the BW change
683 * Per-class bandwidth calculation
685 delta = (cur_time - cl->cl_last_time);
/* Clamp the interval to 8 * machclk_freq. */
686 if (delta > machclk_freq * 8)
687 delta = machclk_freq * 8;
688 cl->cl_bw_delta += delta;
689 cl->cl_bw_bytes += m->m_pkthdr.len;
690 cl->cl_last_time = cur_time;
/* Decay: x -= x >> 3 removes one eighth of each accumulator. */
691 cl->cl_bw_delta -= cl->cl_bw_delta >> 3;
692 cl->cl_bw_bytes -= cl->cl_bw_bytes >> 3;
695 * Per-bucket bandwidth calculation
/* Same clamp-and-decay scheme, applied to the bucket's own counters. */
697 delta = (cur_time - b->last_time);
698 if (delta > machclk_freq * 8)
699 delta = machclk_freq * 8;
700 b->bw_delta += delta;
701 b->bw_bytes += m->m_pkthdr.len;
702 b->last_time = cur_time;
703 b->bw_delta -= b->bw_delta >> 3;
704 b->bw_bytes -= b->bw_bytes >> 3;
710 * Figure out what the next packet would be if there were no limits. If
711 * this class hits its bandwidth limit *hit_limit is set to non-zero, otherwise
712 * it is set to 0. A non-NULL mbuf is returned either way.
/*
 * Peek at the next packet of class `cl` without removing it, and compare
 * the class's measured rate against its configured bandwidth.
 *
 * cl:        class to poll.
 * cur_time:  current machine-clock timestamp supplied by the caller.
 * hit_limit: output flag describing whether the class is over its
 *            bandwidth (the assignments are not visible in this listing).
 *
 * Although this is a poll, it is not read-only: cl_bw_delta and
 * cl_last_time are updated on every call.
 */
715 fairq_pollq(struct fairq_class *cl, uint64_t cur_time, int *hit_limit)
/* ispoll = 1: select a bucket for polling. */
723 b = fairq_selectq(cl, 1);
/* qhead() is used to look at the front packet of the bucket. */
726 m = qhead(&b->queue);
729 * Did this packet exceed the class bandwidth? Calculate the
730 * bandwidth component of the packet.
732 * - Calculate bytes per second
734 delta = cur_time - cl->cl_last_time;
/* Clamp the interval to 8 * machclk_freq. */
735 if (delta > machclk_freq * 8)
736 delta = machclk_freq * 8;
737 cl->cl_bw_delta += delta;
738 cl->cl_last_time = cur_time;
/* Guard against dividing by a zero interval. */
739 if (cl->cl_bw_delta) {
740 bw = cl->cl_bw_bytes * machclk_freq / cl->cl_bw_delta;
742 if (bw > cl->cl_bandwidth)
/* Diagnostic output; whether it is compiled in is not visible here. */
745 printf("BW %6ju relative to %6u %d queue %p\n",
746 (uintmax_t)bw, cl->cl_bandwidth, *hit_limit, b);
753 * Locate the next queue we want to pull a packet out of. This code
754 * is also responsible for removing empty buckets from the circular list.
/*
 * Choose the bucket that the next packet of class `cl` comes from.
 *
 * cl:     class whose circular list of active buckets starts at cl_head.
 * ispoll: non-zero for a poll; zero for an actual dequeue.
 *
 * On a dequeue that follows a poll, the cl_polled reference is consumed
 * and cleared.  Otherwise the list is walked from cl_head: empty buckets
 * are unlinked, and the round-robin head is advanced either always (when
 * cl_hogs_m1 is 0) or only when the bucket's measured rate has reached
 * cl_hogs_m1.
 *
 * NOTE(review): the return statements, the handling of the last remaining
 * bucket and the update of cl_polled on a poll are not visible in this
 * listing.
 */
758 fairq_selectq(struct fairq_class *cl, int ispoll)
763 if (ispoll == 0 && cl->cl_polled) {
765 cl->cl_polled = NULL;
769 while ((b = cl->cl_head) != NULL) {
771 * Remove empty queues from consideration
773 if (qempty(&b->queue)) {
775 cl->cl_head = b->next;
/* The head did not change: b points at itself (sole list member). */
776 if (cl->cl_head == b) {
/* Unlink b from the doubly linked circular list. */
779 b->next->prev = b->prev;
780 b->prev->next = b->next;
786 * Advance the round robin. Queues with bandwidths less
787 * then the hog bandwidth are allowed to burst.
789 if (cl->cl_hogs_m1 == 0) {
790 cl->cl_head = b->next;
/* Only compute a rate when an interval has been accumulated. */
791 } else if (b->bw_delta) {
792 bw = b->bw_bytes * machclk_freq / b->bw_delta;
793 if (bw >= cl->cl_hogs_m1) {
794 cl->cl_head = b->next;
/*
 * Discard every packet queued on class `cl`.
 *
 * Buckets are obtained one at a time from fairq_selectq(cl, 0), which also
 * unlinks empty buckets, so the outer loop ends once no bucket is returned.
 * Each removed packet is added to the class drop counter.
 *
 * NOTE(review): the statement that releases each mbuf and any interface
 * queue-length adjustment are not visible in this listing.
 */
812 fairq_purgeq(struct fairq_class *cl)
817 while ((b = fairq_selectq(cl, 0)) != NULL) {
818 while ((m = _getq(&b->queue)) != NULL) {
819 PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m));
/* The bucket must be empty once the inner loop has drained it. */
822 ASSERT(qlen(&b->queue) == 0);
/*
 * Fill `sp` with a snapshot of the statistics of class `cl`.
 *
 * sp: destination; receives the handle, queue limit, transmit and drop
 *     counters, queue type, the summed length of the active buckets and,
 *     for RED/RIO classes, the RED statistics in sp->red[].
 * cl: class to report on.
 *
 * NOTE(review): the initialisation of sp->qlength and the setup of the
 * bucket walk are not visible in this listing; confirm qlength starts at
 * zero, since the visible statement only adds to it.
 */
827 get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl)
831 sp->class_handle = cl->cl_handle;
832 sp->qlimit = cl->cl_qlimit;
833 sp->xmit_cnt = cl->cl_xmitcnt;
834 sp->drop_cnt = cl->cl_dropcnt;
835 sp->qtype = cl->cl_qtype;
/* Accumulate queue lengths around the circular list until back at cl_head. */
841 sp->qlength += qlen(&b->queue);
843 } while (b != cl->cl_head);
/* cl_red holds RED or RIO state depending on cl_qtype. */
847 if (cl->cl_qtype == Q_RED)
848 red_getstats(cl->cl_red, &sp->red[0]);
851 if (cl->cl_qtype == Q_RIO)
852 rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
856 /* convert a class handle to the corresponding class pointer */
/*
 * Look up the class on `pif` whose cl_handle equals `chandle`.
 *
 * pif:     per-interface FAIRQ state.
 * chandle: class handle to search for.
 *
 * Priority slots are scanned linearly from pif_maxpri down to 0.
 *
 * NOTE(review): the return statements, and any early exit placed before
 * the loop, are not visible in this listing.
 */
857 static struct fairq_class *
858 clh_to_clp(struct fairq_if *pif, uint32_t chandle)
860 struct fairq_class *cl;
866 for (idx = pif->pif_maxpri; idx >= 0; idx--)
867 if ((cl = pif->pif_classes[idx]) != NULL &&
868 cl->cl_handle == chandle)
874 #endif /* ALTQ_FAIRQ */